Example 1
    def step(self, action: float, model: Module):
        _, _, current_statistics = count_flops_params(model,
                                                      self.dummy_input,
                                                      verbose=False)
        current_statistics = {
            result['name']: result
            for result in current_statistics
        }
        index = self.pruning_op_names.index(self.current_op_name)
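        # recompute the action as the sparsity actually achieved on this layer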
        action = 1 - current_statistics[self.current_op_name][
            self.target] / self.current_op_target

        total_current_target = sum([
            current_statistics[name][self.target]
            for name in self.pruning_op_names
        ])
        previous_pruning_target = self.under_pruning_target - total_current_target
        rest_target = sum([
            current_statistics[name][self.target]
            for name in self.pruning_op_names[index + 1:]
        ])

        self.layer_embedding[index][
            -3] = previous_pruning_target / self.under_pruning_target  # reduced
        self.layer_embedding[index][
            -2] = rest_target / self.under_pruning_target  # rest
        self.layer_embedding[index][-1] = action  # last action
        observation = self.layer_embedding[index, :].copy()

        return action, 0, observation, self.is_final_layer()
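
For reference, count_flops_params returns the total FLOPs, the total parameter count, and a list of per-layer result dicts; each dict carries at least the layer 'name' plus its 'flops' and 'params', which is what the examples here index by name. A minimal, self-contained sketch of that pattern (the import path may vary across NNI versions; the toy model is illustrative only):

import torch
from torch.nn import Conv2d, Module
from nni.compression.pytorch.utils.counter import count_flops_params


class ToyModel(Module):
    def __init__(self):
        super().__init__()
        self.conv = Conv2d(3, 16, kernel_size=3)

    def forward(self, x):
        return self.conv(x)


model = ToyModel()
dummy_input = torch.rand(1, 3, 32, 32)
flops, params, results = count_flops_params(model, dummy_input, verbose=False)

# key the per-layer results by module name, as step() above does
statistics = {result['name']: result for result in results}
for name, stats in statistics.items():
    print(name, stats['flops'], stats['params'])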
Example 2
def count_flops(model, log=None, device=None):
    dummy_input = torch.rand([1, 3, 256, 256])
    if device is not None:
        dummy_input = dummy_input.to(device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")
    if log is not None:
        log.write(f"FLOPs: {flops}, params: {params}\n")
    return flops, params
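
A hypothetical call of the helper above, assuming a torchvision model (the model choice and log file name are illustrative, not part of the example):

import torch
import torchvision

model = torchvision.models.resnet18()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
with open('flops_log.txt', 'w') as log:
    flops, params = count_flops(model, log=log, device=device)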
Example 3
    def __init__(self,
                 model: Module,
                 config_list: List[Dict],
                 dummy_input: Tensor,
                 total_sparsity: float,
                 max_sparsity_per_layer: Dict[str, float],
                 target: str = 'flops'):
        pruning_op_names = []
        for config in config_list_canonical(model, config_list):
            pruning_op_names.extend(config['op_names'])
        self.pruning_ops = OrderedDict()
        self.pruning_types = []
        for i, (name, layer) in enumerate(model.named_modules()):
            if name in pruning_op_names:
                op_type = type(layer).__name__
                stride = np.power(np.prod(layer.stride), 1 /
                                  len(layer.stride)) if hasattr(
                                      layer, 'stride') else 0
                kernel_size = np.power(np.prod(layer.kernel_size), 1 /
                                       len(layer.kernel_size)) if hasattr(
                                           layer, 'kernel_size') else 1
                self.pruning_ops[name] = (i, op_type, stride, kernel_size)
                self.pruning_types.append(op_type)
        self.pruning_types = list(set(self.pruning_types))
        self.pruning_op_names = list(self.pruning_ops.keys())
        self.dummy_input = dummy_input

        self.total_sparsity = total_sparsity
        self.max_sparsity_per_layer = max_sparsity_per_layer
        assert target in ['flops', 'params']
        self.target = target

        self.origin_target, self.origin_params_num, self.origin_statistics = count_flops_params(
            model, dummy_input, verbose=False)
        self.origin_statistics = {
            result['name']: result
            for result in self.origin_statistics
        }

        self.under_pruning_target = sum([
            self.origin_statistics[name][self.target]
            for name in self.pruning_op_names
        ])
        self.excepted_pruning_target = self.total_sparsity * self.under_pruning_target
Example 4
    def correct_action(self, action: float, model: Module):
        try:
            op_name = next(self.ops_iter)
            index = self.pruning_op_names.index(op_name)
            _, _, current_statistics = count_flops_params(model,
                                                          self.dummy_input,
                                                          verbose=False)
            current_statistics = {
                result['name']: result
                for result in current_statistics
            }

            total_current_target = sum([
                current_statistics[name][self.target]
                for name in self.pruning_op_names
            ])
            previous_pruning_target = self.under_pruning_target - total_current_target
            max_rest_pruning_target = sum([
                current_statistics[name][self.target] *
                self.max_sparsity_per_layer[name]
                for name in self.pruning_op_names[index + 1:]
            ])
            min_current_pruning_target = self.excepted_pruning_target - previous_pruning_target - max_rest_pruning_target
            max_current_pruning_target_1 = self.origin_statistics[op_name][
                self.target] * self.max_sparsity_per_layer[op_name] - (
                    self.origin_statistics[op_name][self.target] -
                    current_statistics[op_name][self.target])
            max_current_pruning_target_2 = self.excepted_pruning_target - previous_pruning_target
            max_current_pruning_target = min(max_current_pruning_target_1,
                                             max_current_pruning_target_2)
            min_action = min_current_pruning_target / current_statistics[
                op_name][self.target]
            max_action = max_current_pruning_target / current_statistics[
                op_name][self.target]
            if min_action > self.max_sparsity_per_layer[op_name]:
                _logger.warning(
                    '[%s] min action > max sparsity per layer: %f > %f',
                    op_name, min_action, self.max_sparsity_per_layer[op_name])
            action = max(0., min(max_action, max(min_action, action)))

            self.current_op_name = op_name
            self.current_op_target = current_statistics[op_name][self.target]
        except StopIteration:
            raise Error('Something went wrong; this should not happen.')
        return action
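
To illustrate the clamping at the end of correct_action with toy numbers (purely illustrative, not taken from a real run): if the layer currently holds 100 FLOPs, at least 30 of them must be pruned here to keep the overall budget reachable, and at most 80 may be pruned here, then a raw agent action of 0.95 is clipped to 0.8:

# toy numbers, for illustration only
current_layer_target = 100.0        # FLOPs still present in this layer
min_current_pruning_target = 30.0   # must prune at least this much here
max_current_pruning_target = 80.0   # may prune at most this much here

min_action = min_current_pruning_target / current_layer_target  # 0.3
max_action = max_current_pruning_target / current_layer_target  # 0.8

raw_action = 0.95
action = max(0., min(max_action, max(min_action, raw_action)))
print(action)  # 0.8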
Example 5
    def _calculate_flops(self, eps=0.001):
        """FLOPs cost."""
        flops_lut = [{} for i in range(self.cnt_layers)]
        layer_id = 0

        for stage_name in self.lut_ops:
            stage_ops = self.lut_ops[stage_name]
            ops_num = self.layer_num[stage_name]

            for _ in range(ops_num):
                for op_name in stage_ops:
                    layer_config = self.layer_configs[layer_id]
                    key_params = {"fm_size": layer_config[3]}
                    op = stage_ops[op_name](*layer_config[0:3], **key_params)

                    # measured in Flops
                    in_shape = self.layer_in_shapes[layer_id]
                    x = (1, in_shape[0], in_shape[1], in_shape[2])
                    flops, _, _ = count_flops_params(op, x, verbose=False)
                    flops = eps if flops == 0.0 else flops
                    flops_lut[layer_id][op_name] = float(flops)
                layer_id += 1

        return flops_lut
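
A minimal sketch (hypothetical, not part of the example) of how such a per-layer FLOPs lookup table could be consumed, e.g. to cost a sampled architecture by summing the FLOPs of the op chosen in each layer:

def architecture_flops(flops_lut, chosen_ops):
    """chosen_ops[i] is the op name selected for layer i."""
    return sum(flops_lut[layer_id][op_name]
               for layer_id, op_name in enumerate(chosen_ops))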
Example 6
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=transform),
        batch_size=64,
    )
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        'data', train=False, transform=transform),
                                              batch_size=1000)

    # Step1. Model Pretraining
    model = NaiveModel().to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    flops, params, _ = count_flops_params(model, (1, 1, 28, 28), verbose=False)

    if args.pretrained_model_dir is None:
        args.pretrained_model_dir = os.path.join(args.experiment_data_dir,
                                                 'pretrained.pth')

        best_acc = 0
        for epoch in range(args.pretrain_epochs):
            train(args, model, device, train_loader, criterion, optimizer,
                  epoch)
            scheduler.step()
            acc = test(args, model, device, criterion, test_loader)
            if acc > best_acc:
                best_acc = acc
                state_dict = model.state_dict()

        model.load_state_dict(state_dict)
        torch.save(state_dict, args.pretrained_model_dir)
        print(f'Model saved to {args.pretrained_model_dir}')
    else:
        state_dict = torch.load(args.pretrained_model_dir)
        model.load_state_dict(state_dict)
        best_acc = test(args, model, device, criterion, test_loader)

    dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
    time_cost = get_model_time_cost(model, dummy_input)

    # 125.49 M, 0.85M, 93.29, 1.1012
    print(
        f'Pretrained model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}'
    )

    # Step2. Model Pruning
    config_list = [{'sparsity': args.sparsity, 'op_types': ['Conv2d']}]

    kw_args = {}
    if args.dependency_aware:
        dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
        print('Enable the dependency_aware mode')
        # note that not all pruners support the dependency_aware mode
        kw_args['dependency_aware'] = True
        kw_args['dummy_input'] = dummy_input

    pruner = L1FilterPruner(model, config_list, **kw_args)
    model = pruner.compress()
    pruner.get_pruned_weights()

    mask_path = os.path.join(args.experiment_data_dir, 'mask.pth')
    model_path = os.path.join(args.experiment_data_dir, 'pruned.pth')
    pruner.export_model(model_path=model_path, mask_path=mask_path)
    pruner._unwrap_model()  # unwrap all modules to normal state

    # Step3. Model Speedup
    m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
    m_speedup.speedup_model()
    print('model after speedup', model)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=False)
    acc = test(args, model, device, criterion, test_loader)
    time_cost = get_model_time_cost(model, dummy_input)
    print(
        f'Pruned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {acc: .2f}, Time Cost: {time_cost}'
    )

    # Step4. Model Finetuning
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    best_acc = 0
    for epoch in range(args.finetune_epochs):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    model.load_state_dict(state_dict)
    save_path = os.path.join(args.experiment_data_dir, 'finetuned.pth')
    torch.save(state_dict, save_path)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=True)
    time_cost = get_model_time_cost(model, dummy_input)

    # FLOPs 28.48 M, #Params: 0.18M, Accuracy:  89.03, Time Cost: 1.03
    print(
        f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}'
    )
    print(f'Model saved to {save_path}')

    # Step5. Model Quantization via QAT
    config_list = [{
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8
        },
        'op_names': ['relu1']
    }, {
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8
        },
        'op_names': ['relu2']
    }]

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, config_list, optimizer)
    quantizer.compress()

    # Step6. Quantization Aware Training
    best_acc = 0
    for epoch in range(1):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    calibration_path = os.path.join(args.experiment_data_dir,
                                    'calibration.pth')
    calibration_config = quantizer.export_model(model_path, calibration_path)
    print("calibration_config: ", calibration_config)

    # Step7. Model Speedup
    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)
    engine = ModelSpeedupTensorRT(model,
                                  input_shape,
                                  config=calibration_config,
                                  batchsize=batch_size)
    engine.compress()

    test_trt(engine, test_loader)
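
get_model_time_cost is a helper defined elsewhere in the example script; a rough sketch of what such a timing helper could look like (an assumption, not the script's actual implementation):

import time
import torch

def get_model_time_cost(model, dummy_input, runs=32):
    # average wall-clock time of a forward pass over several runs
    model.eval()
    with torch.no_grad():
        start = time.time()
        for _ in range(runs):
            model(dummy_input)
    return (time.time() - start) / runs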
Example 7
        model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(
        model,
        torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speedup
    print('\n' + '=' * 50 +
          ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'total_sparsity': 0.5,
        'op_types': ['Conv2d'],
    }]

    # make sure you have used nni.trace to wrap the optimizer class before initializing it
    traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(),
                                                  lr=0.01,
                                                  momentum=0.9,
Example 8
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    # prepare model and data
    train_loader, test_loader, criterion = get_data(args.dataset,
                                                    args.data_dir,
                                                    args.batch_size,
                                                    args.test_batch_size)

    model, optimizer, scheduler = get_model_optimizer_scheduler(
        args, device, train_loader, test_loader, criterion)

    dummy_input = get_dummy_input(args, device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")

    print('start pruning...')
    model_path = os.path.join(
        args.experiment_data_dir,
        'pruned_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    mask_path = os.path.join(
        args.experiment_data_dir,
        'mask_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))

    pruner = get_pruner(model, args.pruner, device, optimizer,
                        args.dependency_aware)
    model = pruner.compress()

    if args.multi_gpu and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    if args.test_only:
        test(args, model, device, criterion, test_loader)

    best_top1 = 0
    for epoch in range(args.fine_tune_epochs):
        pruner.update_epoch(epoch)
        print('# Epoch {} #'.format(epoch))
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        top1 = test(args, model, device, criterion, test_loader)
        if top1 > best_top1:
            best_top1 = top1
            # Export the best model: 'model_path' stores the state_dict of the
            # pruned model, 'mask_path' stores its mask_dict
            pruner.export_model(model_path=model_path, mask_path=mask_path)

    if args.nni:
        nni.report_final_result(best_top1)

    if args.speed_up:
        # reload the best checkpoint for speed-up
        args.pretrained_model_dir = model_path
        model, _, _ = get_model_optimizer_scheduler(args, device, train_loader,
                                                    test_loader, criterion)
        model.eval()

        apply_compression_results(model, mask_path, device)

        # test model speed
        start = time.time()
        for _ in range(32):
            use_mask_out = model(dummy_input)
        print('elapsed time when use mask: ', time.time() - start)

        m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
        m_speedup.speedup_model()

        flops, params, results = count_flops_params(model, dummy_input)
        print(f"FLOPs: {flops}, params: {params}")

        start = time.time()
        for _ in range(32):
            use_speedup_out = model(dummy_input)
        print('elapsed time when use speedup: ', time.time() - start)

        top1 = test(args, model, device, criterion, test_loader)
Example 9
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parse_args()

    #########################################################################
    # Prepare model, tokenizer, dataset, optimizer, and the scheduler
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # Load dataset and tokenizer, and then preprocess the dataset
    raw_dataset, is_regression, num_labels = get_raw_dataset(args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    processed_datasets = preprocess(args, tokenizer, raw_dataset)
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation_matched" if args.task_name ==
                                      "mnli" else "validation"]

    # Load pretrained model
    config = AutoConfig.from_pretrained(args.model_name,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name)
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name,
                                                               config=config)
    model.to(device)

    #########################################################################
    # Finetune on the target GLUE task before pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(
        args, tokenizer, model, train_dataset, eval_dataset)
    train_steps = args.num_train_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type,
                                 optimizer=optimizer,
                                 num_warmup_steps=args.num_warmup_steps,
                                 num_training_steps=train_steps)
    metric = load_metric("glue", args.task_name)

    logger.info(
        "================= Finetuning before pruning =================")
    train_model(args, model, is_regression, train_dataloader, eval_dataloader,
                optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(),
                   args.output_dir + "/model_before_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric,
                            data_collator)

    #########################################################################
    # Pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(
        args, tokenizer, model, train_dataset, eval_dataset)
    dummy_input = next(iter(train_dataloader))["input_ids"].to(device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(
        f"Initial model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M"
    )

    # Here criterion is embedded in the model. Upper levels can just pass None to trainer.
    def trainer(model, optimizer, criterion, epoch):
        return trainer_helper(model, train_dataloader, optimizer, device)

    def forward_runner(model):
        return forward_runner_helper(model, train_dataloader, device)

    # example: prune different layers with different sparsity
    attention_name_groups = list(
        zip([
            "bert.encoder.layer.{}.attention.self.query".format(i)
            for i in range(12)
        ], [
            "bert.encoder.layer.{}.attention.self.key".format(i)
            for i in range(12)
        ], [
            "bert.encoder.layer.{}.attention.self.value".format(i)
            for i in range(12)
        ], [
            "bert.encoder.layer.{}.attention.output.dense".format(i)
            for i in range(12)
        ]))

    kwargs = {
        "ranking_criterion": args.ranking_criterion,
        "global_sort": args.global_sort,
        "num_iterations": args.num_iterations,
        "epochs_per_iteration": args.epochs_per_iteration,
        "attention_name_groups": attention_name_groups,
        "head_hidden_dim": 64,
        "trainer": trainer,
        "optimizer": optimizer,
        "forward_runner": forward_runner
    }

    config_list = [{
        "sparsity":
        args.sparsity,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[:6] for x in layer]
    }, {
        "sparsity":
        args.sparsity / 2,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[6:] for x in layer]
    }]

    pruner = TransformerHeadPruner(model, config_list, **kwargs)
    pruner.compress()

    #########################################################################
    # uncomment the following part to export the pruned model masks
    # model_path = os.path.join(args.output_dir, "pruned_{}_{}.pth".format(args.model_name, args.task_name))
    # mask_path = os.path.join(args.output_dir, "mask_{}_{}.pth".format(args.model_name, args.task_name))
    # pruner.export_model(model_path=model_path, mask_path=mask_path)

    #########################################################################
    # Speedup
    # Currently, speeding up Transformers through NNI ModelSpeedup is not supported because of shape inference issues.
    # However, if you are using the transformers library, you can use the following workaround:
    # The following code gets the head pruning decisions from the pruner and calls the _prune_heads() function
    # implemented in models from the transformers library to speed up the model.
    if args.speed_up:
        speedup_rules = {}
        for group_idx, group in enumerate(pruner.attention_name_groups):
            # get the layer index
            layer_idx = None
            for part in group[0].split("."):
                try:
                    layer_idx = int(part)
                    break
                except ValueError:
                    continue
            if layer_idx is not None:
                speedup_rules[layer_idx] = pruner.pruned_heads[group_idx]
        pruner._unwrap_model()
        model.bert._prune_heads(speedup_rules)
        print(model)

    #########################################################################
    # After pruning, finetune again on the target task
    # Get the metric function
    metric = load_metric("glue", args.task_name)

    # re-initialize the optimizer and the scheduler
    optimizer, _, _, data_collator = get_dataloader_and_optimizer(
        args, tokenizer, model, train_dataset, eval_dataset)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type,
                                 optimizer=optimizer,
                                 num_warmup_steps=args.num_warmup_steps,
                                 num_training_steps=train_steps)

    logger.info("================= Finetuning after Pruning =================")
    train_model(args, model, is_regression, train_dataloader, eval_dataloader,
                optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(),
                   args.output_dir + "/model_after_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric,
                            data_collator)

    flops, params, results = count_flops_params(model, dummy_input)
    print(
        f"Final model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")
Example 10
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    # prepare model and data
    train_loader, test_loader, criterion = get_data(args.dataset,
                                                    args.data_dir,
                                                    args.batch_size,
                                                    args.test_batch_size)

    model, optimizer, scheduler = get_model_optimizer_scheduler(
        args, device, train_loader, test_loader, criterion)

    dummy_input = get_dummy_input(args, device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")

    print(f'start {args.pruner} pruning...')

    def trainer(model, optimizer, criterion, epoch):
        return train(args,
                     model,
                     device,
                     train_loader,
                     criterion,
                     optimizer,
                     epoch=epoch)

    pruner_cls = str2pruner[args.pruner]

    kw_args = {}
    config_list = [{'sparsity': args.sparsity, 'op_types': ['Conv2d']}]

    if args.pruner == 'level':
        config_list = [{'sparsity': args.sparsity, 'op_types': ['default']}]

    else:
        if args.dependency_aware:
            dummy_input = get_dummy_input(args, device)
            print('Enable the dependency_aware mode')
            # note that not all pruners support the dependency_aware mode
            kw_args['dependency_aware'] = True
            kw_args['dummy_input'] = dummy_input
        if args.pruner not in ('l1filter', 'l2filter', 'fpgm'):
            # trainer/optimizer/criterion are only needed by training-aware pruners
            kw_args['trainer'] = trainer
            kw_args['optimizer'] = optimizer
            kw_args['criterion'] = criterion

        if args.pruner in ('mean_activation', 'apoz', 'taylorfo'):
            kw_args['sparsifying_training_batches'] = 1

        if args.pruner == 'slim':
            kw_args['sparsifying_training_epochs'] = 1

        if args.pruner == 'agp':
            kw_args['pruning_algorithm'] = 'l1'
            kw_args['num_iterations'] = 2
            kw_args['epochs_per_iteration'] = 1

        # Reproduced result in paper 'PRUNING FILTERS FOR EFFICIENT CONVNETS',
        # Conv_1, Conv_8, Conv_9, Conv_10, Conv_11, Conv_12 are pruned with 50% sparsity, as 'VGG-16-pruned-A'
        if args.pruner == 'slim':
            config_list = [{
                'sparsity': args.sparsity,
                'op_types': ['BatchNorm2d'],
            }]
        else:
            config_list = [{
                'sparsity':
                args.sparsity,
                'op_types': ['Conv2d'],
                'op_names': [
                    'feature.0', 'feature.24', 'feature.27', 'feature.30',
                    'feature.34', 'feature.37'
                ]
            }]

    pruner = pruner_cls(model, config_list, **kw_args)

    # Pruner.compress() returns the masked model
    model = pruner.compress()
    pruner.get_pruned_weights()

    # export the pruned model masks for model speedup
    model_path = os.path.join(
        args.experiment_data_dir,
        'pruned_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    mask_path = os.path.join(
        args.experiment_data_dir,
        'mask_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    pruner.export_model(model_path=model_path, mask_path=mask_path)

    if args.test_only:
        test(args, model, device, criterion, test_loader)

    if args.speed_up:
        # Unwrap all modules to normal state
        pruner._unwrap_model()
        m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
        m_speedup.speedup_model()

    print('start finetuning...')
    best_top1 = 0
    save_path = os.path.join(args.experiment_data_dir, 'finetuned.pth')
    for epoch in range(args.fine_tune_epochs):
        print('# Epoch {} #'.format(epoch))
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        top1 = test(args, model, device, criterion, test_loader)
        if top1 > best_top1:
            best_top1 = top1
            torch.save(model.state_dict(), save_path)

    flops, params, results = count_flops_params(model, dummy_input)
    print(
        f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_top1: .2f}'
    )

    if args.nni:
        nni.report_final_result(best_top1)
Example 11
def main(args):
    # prepare dataset
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, criterion = get_data(args.dataset, args.data_dir, args.batch_size, args.test_batch_size)
    model, optimizer = get_trained_model_optimizer(args, device, train_loader, val_loader, criterion)

    def short_term_fine_tuner(model, epochs=1):
        for epoch in range(epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)

    def trainer(model, optimizer, criterion, epoch, callback):
        return train(args, model, device, train_loader, criterion, optimizer, epoch=epoch, callback=callback)

    def evaluator(model):
        return test(model, device, criterion, val_loader)

    # used to save the performance of the original & pruned & finetuned models
    result = {'flops': {}, 'params': {}, 'performance': {}}

    flops, params, _ = count_flops_params(model, get_input_size(args.dataset))
    result['flops']['original'] = flops
    result['params']['original'] = params

    evaluation_result = evaluator(model)
    print('Evaluation result (original model): %s' % evaluation_result)
    result['performance']['original'] = evaluation_result

    # module types to prune, only "Conv2d" supported for channel pruning
    if args.base_algo in ['l1', 'l2', 'fpgm']:
        op_types = ['Conv2d']
    elif args.base_algo == 'level':
        op_types = ['default']

    config_list = [{
        'sparsity': args.sparsity,
        'op_types': op_types
    }]
    dummy_input = get_dummy_input(args, device)

    if args.pruner == 'L1FilterPruner':
        pruner = L1FilterPruner(model, config_list)
    elif args.pruner == 'L2FilterPruner':
        pruner = L2FilterPruner(model, config_list)
    elif args.pruner == 'FPGMPruner':
        pruner = FPGMPruner(model, config_list)
    elif args.pruner == 'NetAdaptPruner':
        pruner = NetAdaptPruner(model, config_list, short_term_fine_tuner=short_term_fine_tuner, evaluator=evaluator,
                                base_algo=args.base_algo, experiment_data_dir=args.experiment_data_dir)
    elif args.pruner == 'ADMMPruner':
        # users are free to change the config here
        if args.model == 'LeNet':
            if args.base_algo in ['l1', 'l2', 'fpgm']:
                config_list = [{
                    'sparsity': 0.8,
                    'op_types': ['Conv2d'],
                    'op_names': ['conv1']
                }, {
                    'sparsity': 0.92,
                    'op_types': ['Conv2d'],
                    'op_names': ['conv2']
                }]
            elif args.base_algo == 'level':
                config_list = [{
                    'sparsity': 0.8,
                    'op_names': ['conv1']
                }, {
                    'sparsity': 0.92,
                    'op_names': ['conv2']
                }, {
                    'sparsity': 0.991,
                    'op_names': ['fc1']
                }, {
                    'sparsity': 0.93,
                    'op_names': ['fc2']
                }]
        else:
            raise ValueError('Example only implemented for LeNet.')
        pruner = ADMMPruner(model, config_list, trainer=trainer, num_iterations=2, training_epochs=2)
    elif args.pruner == 'SimulatedAnnealingPruner':
        pruner = SimulatedAnnealingPruner(
            model, config_list, evaluator=evaluator, base_algo=args.base_algo,
            cool_down_rate=args.cool_down_rate, experiment_data_dir=args.experiment_data_dir)
    elif args.pruner == 'AutoCompressPruner':
        pruner = AutoCompressPruner(
            model, config_list, trainer=trainer, evaluator=evaluator, dummy_input=dummy_input,
            num_iterations=3, optimize_mode='maximize', base_algo=args.base_algo,
            cool_down_rate=args.cool_down_rate, admm_num_iterations=30, admm_training_epochs=5,
            experiment_data_dir=args.experiment_data_dir)
    else:
        raise ValueError("Pruner not supported.")

    # Pruner.compress() returns the masked model,
    # but for AutoCompressPruner, compress() returns the pruned model directly
    model = pruner.compress()
    evaluation_result = evaluator(model)
    print('Evaluation result (masked model): %s' % evaluation_result)
    result['performance']['pruned'] = evaluation_result

    if args.save_model:
        pruner.export_model(
            os.path.join(args.experiment_data_dir, 'model_masked.pth'), os.path.join(args.experiment_data_dir, 'mask.pth'))
        print('Masked model saved to %s' % args.experiment_data_dir)

    # model speed up
    if args.speed_up:
        if args.pruner != 'AutoCompressPruner':
            if args.model == 'LeNet':
                model = LeNet().to(device)
            elif args.model == 'vgg16':
                model = VGG(depth=16).to(device)
            elif args.model == 'resnet18':
                model = ResNet18().to(device)
            elif args.model == 'resnet50':
                model = ResNet50().to(device)

            model.load_state_dict(torch.load(os.path.join(args.experiment_data_dir, 'model_masked.pth')))
            masks_file = os.path.join(args.experiment_data_dir, 'mask.pth')

            m_speedup = ModelSpeedup(model, dummy_input, masks_file, device)
            m_speedup.speedup_model()
            evaluation_result = evaluator(model)
            print('Evaluation result (speed up model): %s' % evaluation_result)
            result['performance']['speedup'] = evaluation_result

            torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_speed_up.pth'))
            print('Speed up model saved to %s' % args.experiment_data_dir)
        flops, params, _ = count_flops_params(model, get_input_size(args.dataset))
        result['flops']['speedup'] = flops
        result['params']['speedup'] = params

    if args.fine_tune:
        if args.dataset == 'mnist':
            optimizer = torch.optim.Adadelta(model.parameters(), lr=1)
            scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
        elif args.dataset == 'cifar10' and args.model == 'vgg16':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1)
        elif args.dataset == 'cifar10' and args.model == 'resnet18':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1)
        elif args.dataset == 'cifar10' and args.model == 'resnet50':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)], gamma=0.1)
        best_acc = 0
        for epoch in range(args.fine_tune_epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)
            scheduler.step()
            acc = evaluator(model)
            if acc > best_acc:
                best_acc = acc
                torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_fine_tuned.pth'))

        print('Evaluation result (fine tuned): %s' % best_acc)
        print('Fine-tuned model saved to %s' % args.experiment_data_dir)
        result['performance']['finetuned'] = best_acc

    with open(os.path.join(args.experiment_data_dir, 'result.json'), 'w+') as f:
        json.dump(result, f)
Example 12
    def generate_tasks(self, task_result: TaskResult) -> List[Task]:
        # append experience & update agent policy
        if task_result.task_id != 'origin':
            action, reward, observation, done = self.env.step(
                self.action, task_result.compact_model)
            self.T.append(
                [reward, self.observation, observation, self.action, done])
            self.observation = observation.copy()

            if done:
                final_reward = task_result.score - 1
                # agent observe and update policy
                for _, s_t, s_t1, a_t, d_t in self.T:
                    self.agent.observe(final_reward, s_t, s_t1, a_t, d_t)
                    if self.current_episode > self.warmup_episode:
                        self.agent.update_policy()

                self.current_episode += 1
                self.T = []
                self.action = None
                self.observation = None

            # update current2origin_sparsity in log file
            origin_model = torch.load(self._origin_model_path)
            compact_model = task_result.compact_model
            compact_model_masks = task_result.compact_model_masks
            current2origin_sparsity, _, _ = compute_sparsity(
                origin_model, compact_model, compact_model_masks,
                self.temp_config_list)
            self._tasks[task_result.task_id].state[
                'current2origin_sparsity'] = current2origin_sparsity
            current2origin_sparsity, _, _ = compute_sparsity(
                origin_model, compact_model, compact_model_masks,
                self.config_list_copy)
            self._tasks[task_result.task_id].state[
                'current_total_sparsity'] = current2origin_sparsity
            flops, params, _ = count_flops_params(compact_model,
                                                  self.dummy_input,
                                                  verbose=False)
            self._tasks[task_result.
                        task_id].state['current_flops'] = '{:.2f} M'.format(
                            flops / 1e6)
            self._tasks[task_result.
                        task_id].state['current_params'] = '{:.2f} M'.format(
                            params / 1e6)

        # generate new action
        if self.current_episode < self.total_episode:
            if self.observation is None:
                self.observation = self.env.reset().copy()
                self.temp_config_list = []
                compact_model = torch.load(self._origin_model_path)
                compact_model_masks = torch.load(self._origin_masks_path)
            else:
                compact_model = task_result.compact_model
                compact_model_masks = task_result.compact_model_masks
            if self.current_episode <= self.warmup_episode:
                action = self.agent.random_action()
            else:
                action = self.agent.select_action(self.observation,
                                                  episode=self.current_episode)
            action = action.tolist()[0]

            self.action = self.env.correct_action(action, compact_model)
            sub_config_list = [{
                'op_names': [self.env.current_op_name],
                'total_sparsity': self.action
            }]
            self.temp_config_list.extend(sub_config_list)

            task_id = self._task_id_candidate
            if self.env.is_first_layer() or self.env.is_final_layer():
                task_config_list = self.temp_config_list
            else:
                task_config_list = sub_config_list

            config_list_path = Path(self._intermediate_result_dir,
                                    '{}_config_list.json'.format(task_id))
            with Path(config_list_path).open('w') as f:
                json_tricks.dump(task_config_list, f, indent=4)

            model_path = Path(
                self._intermediate_result_dir,
                '{}_compact_model.pth'.format(task_result.task_id))
            masks_path = Path(
                self._intermediate_result_dir,
                '{}_compact_model_masks.pth'.format(task_result.task_id))
            torch.save(compact_model, model_path)
            torch.save(compact_model_masks, masks_path)

            task = Task(task_id, model_path, masks_path, config_list_path)
            if not self.env.is_final_layer():
                task.finetune = False
                task.evaluate = False

            self._tasks[task_id] = task
            self._task_id_candidate += 1
            return [task]
        else:
            return []
Example 13
if __name__ == '__main__':
    # model = MobileNetV2(n_class=10).to(device)
    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.1,
                                momentum=0.9,
                                weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer, milestones=[50, 75], gamma=0.1)
    criterion = torch.nn.CrossEntropyLoss()

    for i in range(100):
        trainer(model, optimizer, criterion, i)
    pre_best_acc = evaluator(model)

    dummy_input = torch.rand(10, 3, 32, 32).to(device)
    pre_flops, pre_params, _ = count_flops_params(model, dummy_input)

    config_list = [{
        'op_types': ['Conv2d'],
        'total_sparsity': 0.5,
        'max_sparsity_per_layer': 0.8
    }]

    # if you just want to keep the final result as the best result, pass None as the evaluator;
    # otherwise the result with the highest score (given by the evaluator) is kept as the best result.
    ddpg_params = {
        'hidden1': 300,
        'hidden2': 300,
        'lr_c': 1e-3,
        'lr_a': 1e-4,
        'warmup': 100,