Code Example #1
    def test_flops_params(self):
        class Model1(nn.Module):
            def __init__(self):
                super(Model1, self).__init__()
                self.conv = nn.Conv2d(3, 5, 1, 1)
                self.bn = nn.BatchNorm2d(5)
                self.relu = nn.LeakyReLU()
                self.linear = nn.Linear(20, 10)
                self.upsample = nn.UpsamplingBilinear2d(size=2)
                self.pool = nn.AdaptiveAvgPool2d((2, 2))

            def forward(self, x):
                x = self.conv(x)
                x = self.bn(x)
                x = self.relu(x)
                x = self.upsample(x)
                x = self.pool(x)
                x = x.view(x.size(0), -1)
                x = self.linear(x)
                return x

        class Model2(nn.Module):
            def __init__(self):
                super(Model2, self).__init__()
                self.conv = nn.Conv2d(3, 5, 1, 1)
                self.conv2 = nn.Conv2d(5, 5, 1, 1)

            def forward(self, x):
                x = self.conv(x)
                for _ in range(5):
                    x = self.conv2(x)
                return x

        for bs in [1, 2]:
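            # count_flops_params is expected to return (total FLOPs, total #params, per-module results);
            # mode='full' also counts modules beyond Conv/Linear (e.g. BN, activation, pooling, upsample).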
            flops, params, results = count_flops_params(Model1(),
                                                        (bs, 3, 2, 2),
                                                        mode='full',
                                                        verbose=False)
            assert (flops, params) == (610, 240)

            flops, params, results = count_flops_params(Model2(),
                                                        (bs, 3, 2, 2),
                                                        verbose=False)
            assert (flops, params) == (560, 50)

            from torchvision.models import resnet50
            flops, params, results = count_flops_params(resnet50(),
                                                        (bs, 3, 224, 224),
                                                        verbose=False)
            assert (flops, params) == (4089184256, 25503912)
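
A minimal standalone sketch of the same call, for reference (assuming NNI 2.x, where count_flops_params is importable from nni.compression.pytorch.utils.counter; the exact path may differ between releases):

import torch
from torchvision.models import resnet18
from nni.compression.pytorch.utils.counter import count_flops_params  # assumed NNI 2.x import path

# Either a dummy input tensor or an input shape tuple can be passed.
model = resnet18()
dummy_input = torch.randn(1, 3, 224, 224)
flops, params, results = count_flops_params(model, dummy_input, verbose=False)
print(f'FLOPs: {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M')
# 'results' is a list of per-module dicts with keys such as 'name', 'flops' and 'params',
# which the AMC examples below index by layer name.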
Code Example #2
File: amc_env.py Project: maxpark/nni
    def step(self, action: float, model: Module):
        _, _, current_statistics = count_flops_params(model,
                                                      self.dummy_input,
                                                      verbose=False)
        current_statistics = {
            result['name']: result
            for result in current_statistics
        }
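        # Each per-module result carries at least 'name', 'flops' and 'params';
        # self.target ('flops' or 'params') selects which statistic drives the pruning budget.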
        index = self.pruning_op_names.index(self.current_op_name)
        action = 1 - current_statistics[self.current_op_name][
            self.target] / self.current_op_target

        total_current_target = sum([
            current_statistics[name][self.target]
            for name in self.pruning_op_names
        ])
        previous_pruning_target = self.under_pruning_target - total_current_target
        rest_target = sum([
            current_statistics[name][self.target]
            for name in self.pruning_op_names[index + 1:]
        ])

        self.layer_embedding[index][
            -3] = previous_pruning_target / self.under_pruning_target  # reduced
        self.layer_embedding[index][
            -2] = rest_target / self.under_pruning_target  # rest
        self.layer_embedding[index][-1] = action  # last action
        observation = self.layer_embedding[index, :].copy()

        return action, 0, observation, self.is_final_layer()
Code Example #3
File: amc_env.py Project: yinfupai/nni
    def correct_action(self, action: float, model: Module):
        try:
            op_name = next(self.ops_iter)
            index = self.pruning_op_names.index(op_name)
            _, _, current_statistics = count_flops_params(model, self.dummy_input, verbose=False)
            current_statistics = {result['name']: result for result in current_statistics}

            total_current_target = sum([current_statistics[name][self.target] for name in self.pruning_op_names])
            previous_pruning_target = self.under_pruning_target - total_current_target
            max_rest_pruning_target = sum([current_statistics[name][self.target] * self.max_sparsity_per_layer[name] for name in self.pruning_op_names[index + 1:]])
            min_current_pruning_target = self.excepted_pruning_target - previous_pruning_target - max_rest_pruning_target
            max_current_pruning_target_1 = self.origin_statistics[op_name][self.target] * self.max_sparsity_per_layer[op_name] - (self.origin_statistics[op_name][self.target] - current_statistics[op_name][self.target])
            max_current_pruning_target_2 = self.excepted_pruning_target - previous_pruning_target
            max_current_pruning_target = min(max_current_pruning_target_1, max_current_pruning_target_2)
            min_action = min_current_pruning_target / current_statistics[op_name][self.target]
            max_action = max_current_pruning_target / current_statistics[op_name][self.target]
            if min_action > self.max_sparsity_per_layer[op_name]:
                _logger.warning('[%s] min action > max sparsity per layer: %f > %f', op_name, min_action, self.max_sparsity_per_layer[op_name])
            action = max(0., min(max_action, max(min_action, action)))

            self.current_op_name = op_name
            self.current_op_target = current_statistics[op_name][self.target]
        except StopIteration:
            raise Error('Something went wrong; this should not happen.')
        return action
Code Example #4
File: amc_env.py Project: yinfupai/nni
    def __init__(self, model: Module, config_list: List[Dict], dummy_input: Tensor, total_sparsity: float, max_sparsity_per_layer: Dict[str, float], target: str = 'flops'):
        pruning_op_names = []
        [pruning_op_names.extend(config['op_names']) for config in config_list_canonical(model, config_list)]
        self.pruning_ops = OrderedDict()
        self.pruning_types = []
        for i, (name, layer) in enumerate(model.named_modules()):
            if name in pruning_op_names:
                op_type = type(layer).__name__
                stride = np.power(np.prod(layer.stride), 1 / len(layer.stride)) if hasattr(layer, 'stride') else 0  # type: ignore
                kernel_size = np.power(np.prod(layer.kernel_size), 1 / len(layer.kernel_size)) if hasattr(layer, 'kernel_size') else 1  # type: ignore
                self.pruning_ops[name] = (i, op_type, stride, kernel_size)
                self.pruning_types.append(op_type)
        self.pruning_types = list(set(self.pruning_types))
        self.pruning_op_names = list(self.pruning_ops.keys())
        self.dummy_input = dummy_input

        self.total_sparsity = total_sparsity
        self.max_sparsity_per_layer = max_sparsity_per_layer
        assert target in ['flops', 'params']
        self.target = target

        self.origin_target, self.origin_params_num, origin_statistics = count_flops_params(model, dummy_input, verbose=False)
        self.origin_statistics = {result['name']: result for result in origin_statistics}

        self.under_pruning_target = sum([self.origin_statistics[name][self.target] for name in self.pruning_op_names])
        self.excepted_pruning_target = self.total_sparsity * self.under_pruning_target
Code Example #5
File: utils.py Project: yinfupai/nni
def count_flops(model, log=None, device=None):
    dummy_input = torch.rand([1, 3, 256, 256])
    if device is not None:
        dummy_input = dummy_input.to(device)
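    # verbose is left at its default here, so count_flops_params also prints per-module statistics.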
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")
    if log is not None:
        log.write(f"FLOPs: {flops}, params: {params}\n")
    return flops, params
Code Example #6
def generate_compression_search_space(
        config: CompressionConfig,
        vessel: CompressionVessel) -> Dict[str, Dict]:
    """
    Use the config (constraints & priors) and vessel (model-related objects) to generate the HPO search space.
    """

    search_space = {}
    model, _, evaluator, dummy_input, _, _, _, _ = vessel.export()
    flops, params, results = count_flops_params(model,
                                                dummy_input,
                                                verbose=False,
                                                mode='full')
    metric = evaluator(model)

    module_names_summary = _summary_module_names(model, config.module_types,
                                                 config.module_names,
                                                 config.exclude_module_names)
    for module_name in module_names_summary:
        search_space['{}{}'.format(KEY_MODULE_NAME, module_name)] = {
            '_type': 'uniform',
            '_value': [0, 1]
        }

    assert not config.pruners or not config.quantizers

    # TODO: hard code for step 1, need refactor
    search_space[KEY_PRUNERS] = {
        '_type': 'choice',
        '_value': [pruner_config.json() for pruner_config in config.pruners]
    }

    original_target = {
        'flops': flops,
        'params': params,
        'metric': metric,
        'results': results
    }

    # TODO: the following functions need improvement
    flops_theta = _flops_theta_helper(config.flops, flops)
    params_theta = _flops_theta_helper(config.params, params)
    metric_theta = _metric_theta_helper(config.metric, metric)
    thetas = {
        'flops': flops_theta,
        'params': params_theta,
        'metric': metric_theta
    }

    search_space[KEY_VESSEL] = {'_type': 'choice', '_value': [vessel.json()]}
    search_space[KEY_ORIGINAL_TARGET] = {
        '_type': 'choice',
        '_value': [original_target]
    }
    search_space[KEY_THETAS] = {'_type': 'choice', '_value': [thetas]}
    return search_space
Code Example #7
File: utils.py Project: yinfupai/nni
    def _calculate_flops(self, eps=0.001):
        """FLOPs cost."""
        flops_lut = [{} for i in range(self.cnt_layers)]
        layer_id = 0

        for stage_name in self.lut_ops:
            stage_ops = self.lut_ops[stage_name]
            ops_num = self.layer_num[stage_name]

            for _ in range(ops_num):
                for op_name in stage_ops:
                    layer_config = self.layer_configs[layer_id]
                    key_params = {"fm_size": layer_config[3]}
                    op = stage_ops[op_name](*layer_config[0:3], **key_params)

                    # measured in Flops
                    in_shape = self.layer_in_shapes[layer_id]
                    x = (1, in_shape[0], in_shape[1], in_shape[2])
                    flops, _, _ = count_flops_params(op, x, verbose=False)
                    flops = eps if flops == 0.0 else flops
                    flops_lut[layer_id][op_name] = float(flops)
                layer_id += 1

        return flops_lut
Code Example #8
File: amc_pruner.py Project: maxpark/nni
    def generate_tasks(self, task_result: TaskResult) -> List[Task]:
        # append experience & update agent policy
        if self.action is not None:
            action, reward, observation, done = self.env.step(self.action, task_result.compact_model)
            self.T.append([reward, self.observation, observation, self.action, done])
            self.observation = observation.copy()

            if done:
                assert task_result.score is not None, 'task_result.score should not be None if environment is done.'
                final_reward = task_result.score - 1
                # agent observe and update policy
                for _, s_t, s_t1, a_t, d_t in self.T:
                    self.agent.observe(final_reward, s_t, s_t1, a_t, d_t)
                    if self.current_episode > self.warmup_episode:
                        self.agent.update_policy()

                self.current_episode += 1
                self.T = []
                self.action = None
                self.observation = None

            # update current2origin_sparsity in log file
            origin_model = torch.load(self._origin_model_path)
            compact_model = task_result.compact_model
            compact_model_masks = task_result.compact_model_masks
            current2origin_sparsity, _, _ = compute_sparsity(origin_model, compact_model, compact_model_masks, self.temp_config_list)
            self._tasks[task_result.task_id].state['current2origin_sparsity'] = current2origin_sparsity
            current2origin_sparsity, _, _ = compute_sparsity(origin_model, compact_model, compact_model_masks, self.config_list_copy)
            self._tasks[task_result.task_id].state['current_total_sparsity'] = current2origin_sparsity
            flops, params, _ = count_flops_params(compact_model, self.dummy_input, verbose=False)
            self._tasks[task_result.task_id].state['current_flops'] = '{:.2f} M'.format(flops / 1e6)
            self._tasks[task_result.task_id].state['current_params'] = '{:.2f} M'.format(params / 1e6)

        # generate new action
        if self.current_episode < self.total_episode:
            if self.observation is None:
                self.observation = self.env.reset().copy()
                self.temp_config_list = []
                compact_model = torch.load(self._origin_model_path)
                compact_model_masks = torch.load(self._origin_masks_path)
            else:
                compact_model = task_result.compact_model
                compact_model_masks = task_result.compact_model_masks
            if self.current_episode <= self.warmup_episode:
                action = self.agent.random_action()
            else:
                action = self.agent.select_action(self.observation, episode=self.current_episode)
            action = action.tolist()[0]

            self.action = self.env.correct_action(action, compact_model)
            sub_config_list = [{'op_names': [self.env.current_op_name], 'total_sparsity': self.action}]
            self.temp_config_list.extend(sub_config_list)

            task_id = self._task_id_candidate
            if self.env.is_first_layer() or self.env.is_final_layer():
                task_config_list = self.temp_config_list
            else:
                task_config_list = sub_config_list

            config_list_path = Path(self._intermediate_result_dir, '{}_config_list.json'.format(task_id))
            with Path(config_list_path).open('w') as f:
                json_tricks.dump(task_config_list, f, indent=4)

            model_path = Path(self._intermediate_result_dir, '{}_compact_model.pth'.format(task_result.task_id))
            masks_path = Path(self._intermediate_result_dir, '{}_compact_model_masks.pth'.format(task_result.task_id))
            torch.save(compact_model, model_path)
            torch.save(compact_model_masks, masks_path)

            task = Task(task_id, model_path, masks_path, config_list_path)
            if not self.env.is_final_layer():
                task.finetune = False
                task.evaluate = False

            self._tasks[task_id] = task
            self._task_id_candidate += 1
            return [task]
        else:
            return []
Code Example #9
File: fpgm_pruning_torch.py Project: yinfupai/nni
        model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(
        model,
        torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speedup
    print('\n' + '=' * 50 +
          ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{'sparsity': 0.5, 'op_types': ['Conv2d']}]
    pruner = FPGMPruner(model, config_list)
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model,
                 dummy_input=torch.rand([10, 3, 32, 32]).to(device),
                 masks_file=masks).speedup_model()
    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
Code Example #10
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parse_args()

    #########################################################################
    # Prepare model, tokenizer, dataset, optimizer, and the scheduler
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # Load dataset and tokenizer, and then preprocess the dataset
    raw_dataset, is_regression, num_labels = get_raw_dataset(args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    processed_datasets = preprocess(args, tokenizer, raw_dataset)
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation_matched" if args.task_name ==
                                      "mnli" else "validation"]

    # Load pretrained model
    config = AutoConfig.from_pretrained(
        args.model_name, num_labels=num_labels, finetuning_task=args.task_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, config=config)
    model.to(device)

    #########################################################################
    # Finetune on the target GLUE task before pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(args, tokenizer,
                                                                                               model,
                                                                                               train_dataset,
                                                                                               eval_dataset)
    train_steps = args.num_train_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps,
                                 num_training_steps=train_steps)
    metric = load_metric("glue", args.task_name)

    logger.info("================= Finetuning before pruning =================")
    train_model(args, model, is_regression, train_dataloader,
                eval_dataloader, optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(), args.output_dir + "/model_before_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric, data_collator)

    #########################################################################
    # Pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(args, tokenizer,
                                                                                               model,
                                                                                               train_dataset,
                                                                                               eval_dataset)
    dummy_input = next(iter(train_dataloader))["input_ids"].to(device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"Initial model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")

    # Here criterion is embedded in the model. Upper levels can just pass None to trainer.
    def trainer(model, optimizer, criterion, epoch):
        return trainer_helper(model, train_dataloader, optimizer, device)

    def forward_runner(model):
        return forward_runner_helper(model, train_dataloader, device)

    # example: prune different layers with different sparsity
    attention_name_groups = list(zip(["bert.encoder.layer.{}.attention.self.query".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.self.key".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.self.value".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.output.dense".format(i) for i in range(12)]))

    kwargs = {"ranking_criterion": args.ranking_criterion,
              "global_sort": args.global_sort,
              "num_iterations": args.num_iterations,
              "epochs_per_iteration": args.epochs_per_iteration,
              "attention_name_groups": attention_name_groups,
              "head_hidden_dim": 64,
              "trainer": trainer,
              "optimizer": optimizer,
              "forward_runner": forward_runner}

    config_list = [{
        "sparsity": args.sparsity,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[:6] for x in layer]
    }, {
        "sparsity": args.sparsity / 2,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[6:] for x in layer]
    }]

    pruner = TransformerHeadPruner(model, config_list, **kwargs)
    pruner.compress()

    #########################################################################
    # uncomment the following part to export the pruned model masks
    # model_path = os.path.join(args.output_dir, "pruned_{}_{}.pth".format(args.model_name, args.task_name))
    # mask_path = os.path.join(args.output_dir, "mask_{}_{}.pth".format(args.model_name, args.task_name))
    # pruner.export_model(model_path=model_path, mask_path=mask_path)

    #########################################################################
    # Speedup
    # Currently, speeding up Transformers through NNI ModelSpeedup is not supported because of shape inference issues.
    # However, if you are using the transformers library, you can use the following workaround:
    # The following code gets the head pruning decisions from the pruner and calls the _prune_heads() function
    # implemented in models from the transformers library to speedup the model.
    if args.speedup:
        speedup_rules = {}
        for group_idx, group in enumerate(pruner.attention_name_groups):
            # get the layer index
            layer_idx = None
            for part in group[0].split("."):
                try:
                    layer_idx = int(part)
                    break
                except ValueError:
                    continue
            if layer_idx is not None:
                speedup_rules[layer_idx] = pruner.pruned_heads[group_idx]
        pruner._unwrap_model()
        model.bert._prune_heads(speedup_rules)
        print(model)

    #########################################################################
    # After pruning, finetune again on the target task
    # Get the metric function
    metric = load_metric("glue", args.task_name)

    # re-initialize the optimizer and the scheduler
    optimizer, _, _, data_collator = get_dataloader_and_optimizer(args, tokenizer, model, train_dataset,
                                                                  eval_dataset)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps,
                                 num_training_steps=train_steps)

    logger.info("================= Finetuning after Pruning =================")
    train_model(args, model, is_regression, train_dataloader,
                eval_dataloader, optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(), args.output_dir +
                   "/model_after_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets,
                            metric, data_collator)

    flops, params, results = count_flops_params(model, dummy_input)
    print(f"Final model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")
Code Example #11
def main(args):
    # prepare dataset
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, criterion = get_data(args.dataset, args.data_dir, args.batch_size, args.test_batch_size)
    model, optimizer = get_trained_model_optimizer(args, device, train_loader, val_loader, criterion)

    def short_term_fine_tuner(model, epochs=1):
        for epoch in range(epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)

    def trainer(model, optimizer, criterion, epoch):
        return train(args, model, device, train_loader, criterion, optimizer, epoch=epoch)

    def evaluator(model):
        return test(model, device, criterion, val_loader)

    # used to save the performance of the original & pruned & finetuned models
    result = {'flops': {}, 'params': {}, 'performance':{}}

    flops, params, _ = count_flops_params(model, get_input_size(args.dataset))
    result['flops']['original'] = flops
    result['params']['original'] = params

    evaluation_result = evaluator(model)
    print('Evaluation result (original model): %s' % evaluation_result)
    result['performance']['original'] = evaluation_result

    # module types to prune, only "Conv2d" supported for channel pruning
    if args.base_algo in ['l1', 'l2', 'fpgm']:
        op_types = ['Conv2d']
    elif args.base_algo == 'level':
        op_types = ['default']

    config_list = [{
        'sparsity': args.sparsity,
        'op_types': op_types
    }]
    dummy_input = get_dummy_input(args, device)
    if args.pruner == 'L1FilterPruner':
        pruner = L1FilterPruner(model, config_list)
    elif args.pruner == 'L2FilterPruner':
        pruner = L2FilterPruner(model, config_list)
    elif args.pruner == 'FPGMPruner':
        pruner = FPGMPruner(model, config_list)
    elif args.pruner == 'NetAdaptPruner':
        pruner = NetAdaptPruner(model, config_list, short_term_fine_tuner=short_term_fine_tuner, evaluator=evaluator,
                                base_algo=args.base_algo, experiment_data_dir=args.experiment_data_dir)
    elif args.pruner == 'ADMMPruner':
        # users are free to change the config here
        if args.model == 'LeNet':
            if args.base_algo in ['l1', 'l2', 'fpgm']:
                config_list = [{
                    'sparsity': 0.8,
                    'op_types': ['Conv2d'],
                    'op_names': ['conv1']
                }, {
                    'sparsity': 0.92,
                    'op_types': ['Conv2d'],
                    'op_names': ['conv2']
                }]
            elif args.base_algo == 'level':
                config_list = [{
                    'sparsity': 0.8,
                    'op_names': ['conv1']
                }, {
                    'sparsity': 0.92,
                    'op_names': ['conv2']
                }, {
                    'sparsity': 0.991,
                    'op_names': ['fc1']
                }, {
                    'sparsity': 0.93,
                    'op_names': ['fc2']
                }]
        else:
            raise ValueError('Example only implemented for LeNet.')
        pruner = ADMMPruner(model, config_list, trainer=trainer, num_iterations=2, epochs_per_iteration=2)
    elif args.pruner == 'SimulatedAnnealingPruner':
        pruner = SimulatedAnnealingPruner(
            model, config_list, evaluator=evaluator, base_algo=args.base_algo,
            cool_down_rate=args.cool_down_rate, experiment_data_dir=args.experiment_data_dir)
    elif args.pruner == 'AutoCompressPruner':
        pruner = AutoCompressPruner(
            model, config_list, trainer=trainer, evaluator=evaluator, dummy_input=dummy_input,
            num_iterations=3, optimize_mode='maximize', base_algo=args.base_algo,
            cool_down_rate=args.cool_down_rate, admm_num_iterations=30, admm_epochs_per_iteration=5,
            experiment_data_dir=args.experiment_data_dir)
    else:
        raise ValueError(
            "Pruner not supported.")

    # Pruner.compress() returns the masked model
    # but for AutoCompressPruner, Pruner.compress() returns the pruned model directly
    model = pruner.compress()
    evaluation_result = evaluator(model)
    print('Evaluation result (masked model): %s' % evaluation_result)
    result['performance']['pruned'] = evaluation_result

    if args.save_model:
        pruner.export_model(
            os.path.join(args.experiment_data_dir, 'model_masked.pth'), os.path.join(args.experiment_data_dir, 'mask.pth'))
        print('Masked model saved to %s' % args.experiment_data_dir)

    # model speedup
    if args.speedup:
        if args.pruner != 'AutoCompressPruner':
            if args.model == 'LeNet':
                model = LeNet().to(device)
            elif args.model == 'vgg16':
                model = VGG(depth=16).to(device)
            elif args.model == 'resnet18':
                model = ResNet18().to(device)
            elif args.model == 'resnet50':
                model = ResNet50().to(device)

            model.load_state_dict(torch.load(os.path.join(args.experiment_data_dir, 'model_masked.pth')))
            masks_file = os.path.join(args.experiment_data_dir, 'mask.pth')

            m_speedup = ModelSpeedup(model, dummy_input, masks_file, device)
            m_speedup.speedup_model()
            evaluation_result = evaluator(model)
            print('Evaluation result (speedup model): %s' % evaluation_result)
            result['performance']['speedup'] = evaluation_result

            torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_speedup.pth'))
            print('Speedup model saved to %s' % args.experiment_data_dir)
        flops, params, _ = count_flops_params(model, get_input_size(args.dataset))
        result['flops']['speedup'] = flops
        result['params']['speedup'] = params

    if args.fine_tune:
        if args.dataset == 'mnist':
            optimizer = torch.optim.Adadelta(model.parameters(), lr=1)
            scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
        elif args.dataset == 'cifar10' and args.model == 'vgg16':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1)
        elif args.dataset == 'cifar10' and args.model == 'resnet18':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1)
        elif args.dataset == 'cifar10' and args.model == 'resnet50':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1)
        best_acc = 0
        for epoch in range(args.fine_tune_epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)
            scheduler.step()
            acc = evaluator(model)
            if acc > best_acc:
                best_acc = acc
                torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_fine_tuned.pth'))

    print('Evaluation result (fine tuned): %s' % best_acc)
    print('Fine-tuned model saved to %s' % args.experiment_data_dir)
    result['performance']['finetuned'] = best_acc

    with open(os.path.join(args.experiment_data_dir, 'result.json'), 'w+') as f:
        json.dump(result, f)
Code Example #12
File: trial_entry.py Project: yinfupai/nni
from .config.utils import parse_params, parse_basic_pruner

# TODO: move this function to evaluate module
def sigmoid(x: float, theta0: float = -0.5, theta1: float = 10) -> float:
    return 1 / (1 + math.exp(-theta1 * (x + theta0)))

if __name__ == '__main__':
    kwargs = nni.get_next_parameter()
    pruner_config, config_list, vessel, original_target, thetas = parse_params(kwargs)
    basic_pruner, model, finetuner, evaluator, dummy_input, device = parse_basic_pruner(pruner_config, config_list, vessel)

    # TODO: move the following logic to the execution engine
    log_dir = Path(os.environ['NNI_OUTPUT_DIR']) if 'NNI_OUTPUT_DIR' in os.environ else Path('nni_outputs', 'log')
    task_generator = AGPTaskGenerator(total_iteration=3, origin_model=model, origin_config_list=config_list,
                                      skip_first_iteration=True, log_dir=log_dir)
    speedup = dummy_input is not None
    scheduler = PruningScheduler(pruner=basic_pruner, task_generator=task_generator, finetuner=finetuner, speedup=speedup,
                                 dummy_input=dummy_input, evaluator=None)
    scheduler.compress()
    _, model, _, _, _ = scheduler.get_best_result()
    metric = evaluator(model)
    flops, params, _ = count_flops_params(model, dummy_input, verbose=False, mode='full')

    # TODO: more efficient way to calculate or combine these scores
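    # The ratios below are relative to the original (uncompressed) model and squashed by the sigmoid defined above.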
    flops_score = sigmoid(flops / original_target['flops'], *thetas['flops'])
    params_score = sigmoid(params / original_target['params'], *thetas['params'])
    metric_score = sigmoid(metric / original_target['metric'], *thetas['metric'])
    final_result = flops_score + params_score + metric_score

    nni.report_final_result({'default': final_result, 'flops': flops, 'params': params, 'metric': metric})
Code Example #13
File: end2end_compression.py Project: yinfupai/nni
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
        ])

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=transform),
        batch_size=64,)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transform),
        batch_size=1000)

    # Step1. Model Pretraining
    model = NaiveModel().to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
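    # An input shape tuple can be passed instead of a dummy tensor (cf. Code Example #1).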
    flops, params, _ = count_flops_params(model, (1, 1, 28, 28), verbose=False)

    if args.pretrained_model_dir is None:
        args.pretrained_model_dir = os.path.join(args.experiment_data_dir, f'pretrained.pth')

        best_acc = 0
        for epoch in range(args.pretrain_epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)
            scheduler.step()
            acc = test(args, model, device, criterion, test_loader)
            if acc > best_acc:
                best_acc = acc
                state_dict = model.state_dict()

        model.load_state_dict(state_dict)
        torch.save(state_dict, args.pretrained_model_dir)
        print(f'Model saved to {args.pretrained_model_dir}')
    else:
        state_dict = torch.load(args.pretrained_model_dir)
        model.load_state_dict(state_dict)
        best_acc = test(args, model, device, criterion, test_loader)

    dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
    time_cost = get_model_time_cost(model, dummy_input)

    # Reference output: FLOPs 125.49 M, #Params 0.85 M, Accuracy 93.29, Time Cost 1.1012
    print(f'Pretrained model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}')

    # Step2. Model Pruning
    config_list = [{
        'sparsity': args.sparsity,
        'op_types': ['Conv2d']
    }]

    kw_args = {}
    if args.dependency_aware:
        dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
        print('Enable the dependency_aware mode')
        # note that not all pruners support the dependency_aware mode
        kw_args['dependency_aware'] = True
        kw_args['dummy_input'] = dummy_input

    pruner = L1FilterPruner(model, config_list, **kw_args)
    model = pruner.compress()
    pruner.get_pruned_weights()

    mask_path = os.path.join(args.experiment_data_dir, 'mask.pth')
    model_path = os.path.join(args.experiment_data_dir, 'pruned.pth')
    pruner.export_model(model_path=model_path, mask_path=mask_path)
    pruner._unwrap_model()  # unwrap all modules to normal state

    # Step3. Model Speedup
    m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
    m_speedup.speedup_model()
    print('model after speedup', model)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=False)
    acc = test(args, model, device, criterion, test_loader)
    time_cost = get_model_time_cost(model, dummy_input)
    print(f'Pruned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {acc: .2f}, Time Cost: {time_cost}')

    # Step4. Model Finetuning
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    best_acc = 0
    for epoch in range(args.finetune_epochs):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    model.load_state_dict(state_dict)
    save_path = os.path.join(args.experiment_data_dir, f'finetuned.pth')
    torch.save(state_dict, save_path)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=True)
    time_cost = get_model_time_cost(model, dummy_input)

    # FLOPs 28.48 M, #Params: 0.18M, Accuracy:  89.03, Time Cost: 1.03
    print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}')
    print(f'Model saved to {save_path}')

    # Step5. Model Quantization via QAT
    config_list = [{
        'quant_types': ['weight', 'output'],
        'quant_bits': {'weight': 8, 'output': 8},
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output':8},
        'op_names': ['relu1']
    }, {
        'quant_types': ['weight', 'output'],
        'quant_bits': {'weight': 8, 'output': 8},
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu2']
    }]

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, config_list, optimizer)
    quantizer.compress()

    # Step6. Quantization Aware Training
    best_acc = 0
    for epoch in range(1):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    calibration_path = os.path.join(args.experiment_data_dir, 'calibration.pth')
    calibration_config = quantizer.export_model(model_path, calibration_path)
    print("calibration_config: ", calibration_config)

    # Step7. Model Speedup
    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)
    engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
    engine.compress()

    test_trt(engine, test_loader)
Code Example #14
File: basic_pruners_torch.py Project: yinfupai/nni
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    # prepare model and data
    train_loader, test_loader, criterion = get_data(args.dataset,
                                                    args.data_dir,
                                                    args.batch_size,
                                                    args.test_batch_size)

    model, optimizer, _ = get_model_optimizer_scheduler(
        args, device, train_loader, test_loader, criterion)

    dummy_input = get_dummy_input(args, device)
    flops, params, _ = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")

    print(f'start {args.pruner} pruning...')

    def trainer(model, optimizer, criterion, epoch):
        return train(args,
                     model,
                     device,
                     train_loader,
                     criterion,
                     optimizer,
                     epoch=epoch)

    pruner_cls = str2pruner[args.pruner]

    kw_args = {}
    config_list = [{'sparsity': args.sparsity, 'op_types': ['Conv2d']}]

    if args.pruner == 'level':
        config_list = [{'sparsity': args.sparsity, 'op_types': ['default']}]

    else:
        if args.global_sort:
            print('Enable the global_sort mode')
            # only taylor pruner supports global sort mode currently
            kw_args['global_sort'] = True
        if args.dependency_aware:
            dummy_input = get_dummy_input(args, device)
            print('Enable the dependency_aware mode')
            # note that not all pruners support the dependency_aware mode
            kw_args['dependency_aware'] = True
            kw_args['dummy_input'] = dummy_input
        if args.pruner not in ('l1filter', 'l2filter', 'fpgm'):
            # these settings only apply to training-aware pruners
            kw_args['trainer'] = trainer
            kw_args['optimizer'] = optimizer
            kw_args['criterion'] = criterion

        if args.pruner in ('mean_activation', 'apoz', 'taylorfo'):
            kw_args['sparsifying_training_batches'] = 1

        if args.pruner == 'slim':
            kw_args['sparsifying_training_epochs'] = 1

        if args.pruner == 'agp':
            kw_args['pruning_algorithm'] = 'l1'
            kw_args['num_iterations'] = 2
            kw_args['epochs_per_iteration'] = 1

        # Reproduces the result in the paper 'PRUNING FILTERS FOR EFFICIENT CONVNETS':
        # Conv_1, Conv_8, Conv_9, Conv_10, Conv_11, Conv_12 are pruned with 50% sparsity, as in 'VGG-16-pruned-A'.
        # If you want to skip some layers, you can use 'exclude' as shown below.
        if args.pruner == 'slim':
            config_list = [{
                'sparsity': args.sparsity,
                'op_types': ['BatchNorm2d'],
            }]
        elif args.model == 'resnet18':
            config_list = [{
                'sparsity': args.sparsity,
                'op_types': ['Conv2d']
            }, {
                'exclude': True,
                'op_names': ['layer1.0.conv1', 'layer1.0.conv2']
            }]
        else:
            config_list = [{
                'sparsity':
                args.sparsity,
                'op_types': ['Conv2d'],
                'op_names': [
                    'feature.0', 'feature.24', 'feature.27', 'feature.30',
                    'feature.34', 'feature.37'
                ]
            }]

    pruner = pruner_cls(model, config_list, **kw_args)

    # Pruner.compress() returns the masked model
    model = pruner.compress()
    pruner.get_pruned_weights()

    # export the pruned model masks for model speedup
    model_path = os.path.join(
        args.experiment_data_dir,
        'pruned_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    mask_path = os.path.join(
        args.experiment_data_dir,
        'mask_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    pruner.export_model(model_path=model_path, mask_path=mask_path)

    if args.test_only:
        test(args, model, device, criterion, test_loader)

    if args.speedup:
        # Unwrap all modules to normal state
        pruner._unwrap_model()
        m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
        m_speedup.speedup_model()

    print('start finetuning...')

    # The optimizer used in the pruner might have been patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer,
                            milestones=[
                                int(args.pretrain_epochs * 0.5),
                                int(args.pretrain_epochs * 0.75)
                            ],
                            gamma=0.1)

    best_top1 = 0
    save_path = os.path.join(args.experiment_data_dir, f'finetuned.pth')
    for epoch in range(args.fine_tune_epochs):
        print('# Epoch {} #'.format(epoch))
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        top1 = test(args, model, device, criterion, test_loader)
        if top1 > best_top1:
            best_top1 = top1
            torch.save(model.state_dict(), save_path)

    flops, params, results = count_flops_params(model, dummy_input)
    print(
        f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_top1: .2f}'
    )

    if args.nni:
        nni.report_final_result(best_top1)
Code Example #15
if __name__ == '__main__':
    # model = MobileNetV2(n_class=10).to(device)
    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.1,
                                momentum=0.9,
                                weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer, milestones=[50, 75], gamma=0.1)
    criterion = torch.nn.CrossEntropyLoss()

    for i in range(100):
        trainer(model, optimizer, criterion, i)
    pre_best_acc = evaluator(model)

    dummy_input = torch.rand(10, 3, 32, 32).to(device)
    pre_flops, pre_params, _ = count_flops_params(model, dummy_input)

    config_list = [{
        'op_types': ['Conv2d'],
        'total_sparsity': 0.5,
        'max_sparsity_per_layer': 0.8
    }]

    # if you just want to keep the final result as the best result, you can pass evaluator as None,
    # otherwise the result with the highest score (given by the evaluator) will be kept as the best result.
    ddpg_params = {
        'hidden1': 300,
        'hidden2': 300,
        'lr_c': 1e-3,
        'lr_a': 1e-4,
        'warmup': 100,