Example #1
def test_resume(config, tmp_path, multiprocessing_distributed,
                case_common_dirs):
    c = config

    checkpoint_save_dir = os.path.join(str(tmp_path), "models")
    config_factory = ConfigFactory(config['config'], tmp_path / 'config.json')
    ckpt_path = os.path.join(
        case_common_dirs["checkpoint_save_dir"],
        "distributed" if multiprocessing_distributed else "data_parallel",
        get_name(config_factory.config) + "_last.pth")
    if "max_iter" in config_factory.config:
        config_factory.config["max_iter"] += 2
    args = {
        "--mode": "train",
        "--data": c["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": c["batch_size"] * torch.cuda.device_count(),
        "--workers": 1,
        "--epochs": 2,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--resume": ckpt_path,
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, c["sample_type"]))
    res = runner.run()
    assert res == 0
    assert os.path.exists(
        os.path.join(checkpoint_save_dir,
                     get_name(config_factory.config) + "_last.pth"))
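All of these tests build the sample's command line from a plain args dict. The real create_command_line helper is not shown in these examples, so as rough orientation only, here is a minimal sketch of how such a helper could expand the dict (the entry-point path and flag handling are assumptions, not the actual implementation):

def create_command_line_sketch(args, sample_type):
    # Hypothetical sketch only; the real create_command_line may differ.
    # Keys are taken as CLI flags; a None value (e.g. "--multiprocessing-distributed")
    # is assumed to be emitted as a bare switch without an argument.
    cli = ["python", "examples/{}/main.py".format(sample_type)]  # assumed entry point
    for flag, value in args.items():
        cli.append(flag)
        if value is not None:
            cli.append(str(value))
    return " ".join(cli)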
Example #2
def test_resume(config, tmp_path, multiprocessing_distributed,
                case_common_dirs):
    checkpoint_save_dir = os.path.join(str(tmp_path), "models")
    config_factory = ConfigFactory(config['nncf_config'],
                                   tmp_path / 'config.json')
    ckpt_path = get_resuming_checkpoint_path(
        config_factory, multiprocessing_distributed,
        case_common_dirs["checkpoint_save_dir"])
    if "max_iter" in config_factory.config:
        config_factory.config["max_iter"] += 2
    args = {
        "--mode": "train",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * torch.cuda.device_count(),
        "--workers":
        0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 2,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--resume": ckpt_path,
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, config["sample_type"]))
    res = runner.run()
    assert res == 0
    last_checkpoint_path = os.path.join(
        checkpoint_save_dir,
        get_name(config_factory.config) + "_last.pth")
    assert os.path.exists(last_checkpoint_path)
    assert torch.load(last_checkpoint_path)['compression_level'] in (
        CompressionLevel.FULL, CompressionLevel.PARTIAL)
Example #3
def test_trained_model_eval(config, tmp_path, multiprocessing_distributed,
                            case_common_dirs):
    config_factory = ConfigFactory(config['nncf_config'],
                                   tmp_path / 'config.json')
    ckpt_path = os.path.join(
        case_common_dirs["checkpoint_save_dir"],
        "distributed" if multiprocessing_distributed else "data_parallel",
        get_name(config_factory.config) + "_last.pth")
    args = {
        "--mode": "test",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * torch.cuda.device_count(),
        "--workers":
        0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--weights": ckpt_path,
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, config["sample_type"]))
    res = runner.run()
    assert res == 0
Example #4
def test_pretrained_model_train(config, tmp_path, multiprocessing_distributed,
                                case_common_dirs):
    checkpoint_save_dir = os.path.join(
        case_common_dirs["checkpoint_save_dir"],
        "distributed" if multiprocessing_distributed else "data_parallel")
    config_factory = ConfigFactory(config['nncf_config'],
                                   tmp_path / 'config.json')
    args = {
        "--mode": "train",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * torch.cuda.device_count(),
        "--workers":
        0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 2,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--dist-url": "tcp://127.0.0.1:8989"
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, config["sample_type"]))
    runner.run()
    last_checkpoint_path = os.path.join(
        checkpoint_save_dir,
        get_name(config_factory.config) + "_last.pth")
    assert os.path.exists(last_checkpoint_path)
    assert torch.load(last_checkpoint_path)['compression_level'] in (
        CompressionLevel.FULL, CompressionLevel.PARTIAL)
Example #5
def test_pretrained_model_train(config, tmp_path, multiprocessing_distributed):
    c = config

    checkpoint_save_dir = os.path.join(
        c["checkpoint_save_dir"],
        "distributed" if multiprocessing_distributed else "data_parallel")
    config_factory = ConfigFactory(config['config'], tmp_path / 'config.json')
    args = {
        "--mode": "train",
        "--data": c["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": c["batch_size"] * torch.cuda.device_count(),
        "--workers": 1,
        "--epochs": 1,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--dist-url": "tcp://127.0.0.1:8989"
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, c["sample_type"]))
    res = runner.run()
    assert res == 0
    assert os.path.exists(
        os.path.join(checkpoint_save_dir,
                     get_name(config_factory.config) + "_last.pth"))
Example #6
def train(config, compression_algo, model, criterion, is_inception,
          lr_scheduler, model_name, optimizer, train_loader, train_sampler,
          val_loader):
    global best_acc1
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)
        lr_scheduler.step(epoch if not isinstance(
            lr_scheduler, ReduceLROnPlateau) else best_acc1)

        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer,
                    compression_algo, epoch, config, is_inception)

        # compute compression algo statistics
        stats = compression_algo.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # update compression scheduler state at the end of the epoch
        compression_algo.scheduler.epoch_step()

        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir,
                                       get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': compression_algo.scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1,
                                        config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar(
                        "compression/statistics/{0}".format(key), value,
                        len(train_loader) * epoch)
Example #7
def test_trained_model_export(config, tmp_path, multiprocessing_distributed):
    c = config

    config_factory = ConfigFactory(config['config'], tmp_path / 'config.json')
    ckpt_path = os.path.join(
        c["checkpoint_save_dir"],
        "distributed" if multiprocessing_distributed else "data_parallel",
        get_name(config_factory.config) + "_last.pth")
    onnx_path = os.path.join(str(tmp_path), "model.onnx")
    args = {
        "--mode": "test",
        "--config": config_factory.serialize(),
        "--to-onnx": onnx_path,
        "--weights": ckpt_path
    }

    runner = Command(create_command_line(args, c["sample_type"]))
    res = runner.run()
    assert res == 0
    assert os.path.exists(onnx_path)
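The export test above only asserts that the ONNX file exists. As an optional follow-up, and assuming the onnx package is available (it is not used by these examples), the exported file could also be structurally validated:

import onnx

# Optional extra check on the exported model; assumption: the onnx package is installed.
# The test above only checks that the file was written.
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)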
Example #8
def train_epoch_end(config, compression_algo, net, epoch, iteration,
                    epoch_size, lr_scheduler, optimizer, test_data_loader,
                    best_mAp):
    is_best = False
    test_freq_in_epochs = max(config.test_interval // epoch_size, 1)
    compression_algo.scheduler.epoch_step(epoch)
    if not isinstance(lr_scheduler, ReduceLROnPlateau):
        lr_scheduler.step(epoch)
    if epoch % test_freq_in_epochs == 0 and iteration != 0:
        if is_on_first_rank(config):
            print_statistics(compression_algo.statistics())
        with torch.no_grad():
            net.eval()
            mAP = test_net(net,
                           config.device,
                           test_data_loader,
                           distributed=config.multiprocessing_distributed)
            if mAP > best_mAp:
                is_best = True
                best_mAp = mAP
            if config.metrics_dump is not None:
                write_metrics(mAP, config)
            if isinstance(lr_scheduler, ReduceLROnPlateau):
                lr_scheduler.step(mAP)
            net.train()
    if is_on_first_rank(config):
        checkpoint_file_path = osp.join(config.checkpoint_save_dir,
                                        "{}_last.pth".format(get_name(config)))
        torch.save(
            {
                'state_dict': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'iter': iteration,
                'scheduler': compression_algo.scheduler.state_dict()
            }, str(checkpoint_file_path))
        make_additional_checkpoints(checkpoint_file_path,
                                    is_best=is_best,
                                    epoch=epoch + 1,
                                    config=config)
    return best_mAp
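Since train_epoch_end returns the updated best_mAp, the surrounding training loop is expected to pass the running value in and store the result back at every epoch boundary. A minimal sketch of that calling contract, reusing the names from the function above (the rest of the loop body is elided):

best_mAp = 0
for iteration in range(config.start_iter, config['max_iter']):
    epoch = iteration // epoch_size
    if iteration % epoch_size == 0:
        best_mAp = train_epoch_end(config, compression_algo, net, epoch,
                                   iteration, epoch_size, lr_scheduler,
                                   optimizer, test_data_loader, best_mAp)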
Example #9
def get_resuming_checkpoint_path(config_factory, multiprocessing_distributed,
                                 checkpoint_save_dir):
    return os.path.join(
        checkpoint_save_dir,
        "distributed" if multiprocessing_distributed else "data_parallel",
        get_name(config_factory.config) + "_last.pth")
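A short usage sketch with hypothetical values (Example #2 above calls the helper the same way); only the resulting directory layout matters:

# <checkpoint_save_dir>/<"distributed" | "data_parallel">/<config name>_last.pth
ckpt_path = get_resuming_checkpoint_path(
    config_factory,
    multiprocessing_distributed=True,
    checkpoint_save_dir="/tmp/checkpoints")
# -> "/tmp/checkpoints/distributed/<config name>_last.pth"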
Example #10
                jconfig = json.load(config_path_.open())
                args_ = {
                    'data': dataset_path,
                    'weights': weights_path_,
                    'config': str(config_path_)
                }
                if batch_size:
                    args_['batch-size'] = batch_size
                if epochs:
                    args_['epochs'] = epochs
                test_config_ = {
                    'sample_type': sample_type_,
                    'expected_accuracy': expected_accuracy_,
                    'absolute_tolerance_train': absolute_tolerance_train_,
                    'absolute_tolerance_eval': absolute_tolerance_eval_,
                    'checkpoint_name': get_name(jconfig)
                }
                CONFIG_PARAMS.append(
                    (test_config_, args_, execution_arg_, dataset_name_))


def get_config_name(config_path):
    base = os.path.basename(config_path)
    return os.path.splitext(base)[0]


@pytest.fixture(
    scope='module',
    params=CONFIG_PARAMS,
    ids=[
Example #11
def train(net, compression_algo, train_data_loader, test_data_loader, criterion, optimizer, config, lr_scheduler):
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0

    epoch_size = len(train_data_loader)
    print('Training ', config.model, ' on ', train_data_loader.dataset.name, ' dataset...')
    batch_iterator = None

    t_start = time.time()
    print_statistics(compression_algo.statistics())

    for iteration in range(config.start_iter, config['max_iter']):
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(train_data_loader)

        epoch = iteration // epoch_size
        if iteration % epoch_size == 0:
            train_epoch_end(config, compression_algo, net, epoch, iteration, epoch_size, lr_scheduler, optimizer,
                            test_data_loader)

        compression_algo.scheduler.step(iteration - config.start_iter)

        optimizer.zero_grad()
        batch_iterator, batch_loss, batch_loss_c, batch_loss_l, loss_comp = train_step(
            batch_iterator, compression_algo, config, criterion, net, train_data_loader
        )
        optimizer.step()
        batch_loss_l = batch_loss_l / config.iter_size
        batch_loss_c = batch_loss_c / config.iter_size
        model_loss = (batch_loss_l + batch_loss_c) / config.iter_size
        batch_loss = batch_loss / config.iter_size

        loc_loss += batch_loss_l.item()
        conf_loss += batch_loss_c.item()

        ###########################
        # Logging
        ###########################

        if is_on_first_rank(config):
            config.tb.add_scalar("train/loss_l", batch_loss_l.item(), iteration)
            config.tb.add_scalar("train/loss_c", batch_loss_c.item(), iteration)
            config.tb.add_scalar("train/loss", batch_loss.item(), iteration)

            checkpoint_file_path = osp.join(config.checkpoint_save_dir, "{}_last.pth".format(get_name(config)))
            torch.save({
                'state_dict': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'iter': config['max_iter'],
                'scheduler': compression_algo.scheduler.state_dict()
            }, str(checkpoint_file_path))
            make_additional_checkpoints(checkpoint_file_path,
                                        is_best=True,
                                        epoch=epoch + 1,
                                        config=config)

        if iteration % config.print_freq == 0:
            t_finish = time.time()
            t_elapsed = t_finish - t_start
            t_start = time.time()
            print('{}: iter {} epoch {} || Loss: {:.4} || Time {:.4}s || lr: {} || CR loss: {}'.format(
                config.rank, iteration, epoch, model_loss.item(), t_elapsed, optimizer.param_groups[0]['lr'],
                loss_comp.item() if isinstance(loss_comp, torch.Tensor) else loss_comp
            ))
Example #12
def train(config,
          compression_ctrl,
          model,
          criterion,
          is_inception,
          lr_scheduler,
          model_name,
          optimizer,
          train_loader,
          train_sampler,
          val_loader,
          best_acc1=0):
    best_compression_level = CompressionLevel.NONE
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer,
                    compression_ctrl, epoch, config, is_inception)

        # Learning rate scheduling should be applied after optimizer’s update
        lr_scheduler.step(epoch if not isinstance(
            lr_scheduler, ReduceLROnPlateau) else best_acc1)

        # update compression scheduler state at the end of the epoch
        compression_ctrl.scheduler.epoch_step()

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        compression_level = compression_ctrl.compression_level()
        # remember best acc@1, considering the compression level. If the current acc@1 is less
        # than the best acc@1, the checkpoint can still be the best one if its compression level
        # is higher than the best level so far. Compression levels in ascending order:
        # NONE, PARTIAL, FULL.

        is_best_by_accuracy = acc1 > best_acc1 and compression_level == best_compression_level
        is_best = is_best_by_accuracy or compression_level > best_compression_level
        if is_best:
            best_acc1 = acc1
        best_compression_level = max(compression_level, best_compression_level)
        acc = best_acc1 / 100
        if config.metrics_dump is not None:
            write_metrics(acc, config.metrics_dump)
        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir,
                                       get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'compression_level': compression_level,
                'acc1': acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': compression_ctrl.scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1,
                                        config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar(
                        "compression/statistics/{0}".format(key), value,
                        len(train_loader) * epoch)
Example #13
def train(net, compression_ctrl, train_data_loader, test_data_loader,
          criterion, optimizer, config, lr_scheduler):
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0

    epoch_size = len(train_data_loader)
    logger.info('Training {} on {} dataset...'.format(
        config.model, train_data_loader.dataset.name))
    batch_iterator = None

    t_start = time.time()
    print_statistics(compression_ctrl.statistics())

    best_mAp = 0
    best_compression_level = CompressionLevel.NONE
    test_freq_in_epochs = max(config.test_interval // epoch_size, 1)

    for iteration in range(config.start_iter, config['max_iter']):
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(train_data_loader)

        epoch = iteration // epoch_size

        if (iteration + 1) % epoch_size == 0:
            compression_ctrl.scheduler.epoch_step(epoch)
            compression_level = compression_ctrl.compression_level()
            is_best = False

            if (epoch + 1) % test_freq_in_epochs == 0:
                if is_on_first_rank(config):
                    print_statistics(compression_ctrl.statistics())
                with torch.no_grad():
                    net.eval()
                    mAP = test_net(
                        net,
                        config.device,
                        test_data_loader,
                        distributed=config.multiprocessing_distributed)
                    is_best_by_mAP = mAP > best_mAp and compression_level == best_compression_level
                    is_best = is_best_by_mAP or compression_level > best_compression_level
                    if is_best:
                        best_mAp = mAP
                    best_compression_level = max(compression_level,
                                                 best_compression_level)
                    net.train()

            # Learning rate scheduling should be applied after optimizer’s update
            if not isinstance(lr_scheduler, ReduceLROnPlateau):
                lr_scheduler.step(epoch)
            else:
                lr_scheduler.step(mAP)

            if is_on_first_rank(config):
                logger.info('Saving state, iter: {}'.format(iteration))

                checkpoint_file_path = osp.join(
                    config.checkpoint_save_dir,
                    "{}_last.pth".format(get_name(config)))
                torch.save(
                    {
                        'state_dict': net.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'iter': config['max_iter'],
                        'scheduler': compression_ctrl.scheduler.state_dict(),
                        'compression_level': compression_level,
                    }, str(checkpoint_file_path))
                make_additional_checkpoints(checkpoint_file_path,
                                            is_best=is_best,
                                            epoch=epoch + 1,
                                            config=config)

        compression_ctrl.scheduler.step(iteration - config.start_iter)

        optimizer.zero_grad()
        batch_iterator, batch_loss, batch_loss_c, batch_loss_l, loss_comp = train_step(
            batch_iterator, compression_ctrl, config, criterion, net,
            train_data_loader)
        optimizer.step()

        batch_loss_l = batch_loss_l / config.iter_size
        batch_loss_c = batch_loss_c / config.iter_size
        model_loss = (batch_loss_l + batch_loss_c) / config.iter_size
        batch_loss = batch_loss / config.iter_size

        loc_loss += batch_loss_l.item()
        conf_loss += batch_loss_c.item()

        ###########################
        # Logging
        ###########################

        if is_on_first_rank(config):
            config.tb.add_scalar("train/loss_l", batch_loss_l.item(),
                                 iteration)
            config.tb.add_scalar("train/loss_c", batch_loss_c.item(),
                                 iteration)
            config.tb.add_scalar("train/loss", batch_loss.item(), iteration)

        if iteration % config.print_freq == 0:
            t_finish = time.time()
            t_elapsed = t_finish - t_start
            t_start = time.time()
            logger.info(
                '{}: iter {} epoch {} || Loss: {:.4} || Time {:.4}s || lr: {} || CR loss: {}'
                .format(
                    config.rank, iteration, epoch, model_loss.item(),
                    t_elapsed, optimizer.param_groups[0]['lr'],
                    loss_comp.item()
                    if isinstance(loss_comp, torch.Tensor) else loss_comp))

    if config.metrics_dump is not None:
        write_metrics(best_mAp, config.metrics_dump)
def train_staged(config,
                 compression_ctrl,
                 model,
                 criterion,
                 is_inception,
                 optimizer_scheduler,
                 model_name,
                 optimizer,
                 train_loader,
                 train_sampler,
                 val_loader,
                 kd_loss_calculator,
                 batch_multiplier,
                 best_acc1=0):
    best_compression_level = CompressionLevel.NONE
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch_staged(train_loader, batch_multiplier, model, criterion,
                           optimizer, optimizer_scheduler, kd_loss_calculator,
                           compression_ctrl, epoch, config, is_inception)

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        compression_level = compression_ctrl.compression_level()
        # remember best acc@1, considering the compression level. If the current acc@1 is less
        # than the best acc@1, the checkpoint can still be the best one if its compression level
        # is higher than the best level so far. Compression levels in ascending order:
        # NONE, PARTIAL, FULL.
        is_best_by_accuracy = acc1 > best_acc1 and compression_level == best_compression_level
        is_best = is_best_by_accuracy or compression_level > best_compression_level
        best_acc1 = max(acc1, best_acc1)
        best_compression_level = max(compression_level, best_compression_level)

        # statistics (e.g. the portion of enabled quantizers) refer to the epoch that has just
        # finished, hence printing should happen before epoch_step, which may already reflect
        # the state of the next epoch (e.g. the next portion of enabled quantizers)
        if is_main_process():
            print_statistics(stats)

        # update compression scheduler state at the end of the epoch
        compression_ctrl.scheduler.epoch_step()
        optimizer_scheduler.epoch_step()

        if is_main_process():
            checkpoint_path = osp.join(config.checkpoint_save_dir,
                                       get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'original_model_state_dict': kd_loss_calculator.original_model.state_dict(),
                'best_acc1': best_acc1,
                'compression_level': compression_level,
                'optimizer': optimizer.state_dict(),
                'compression_scheduler': compression_ctrl.scheduler.state_dict(),
                'optimizer_scheduler': optimizer_scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1,
                                        config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar(
                        "compression/statistics/{0}".format(key), value,
                        len(train_loader) * epoch)