Example #1
def run(config):
    strategy = get_distribution_strategy(config)
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    # Create model
    model_fn, model_params = get_model(
        config.model,
        input_shape=config.get('input_info', {}).get('sample_size', None),
        num_classes=config.get('num_classes', 1000),
        pretrained=config.get('pretrained', False),
        weights=config.get('weights', None))

    # Create dataset
    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
    datasets = [builder.build() for builder in builders]

    train_builder, validation_builder = builders
    train_dataset, validation_dataset = datasets

    # Training parameters
    train_epochs = config.epochs
    train_steps = train_builder.steps_per_epoch
    validation_steps = validation_builder.steps_per_epoch

    with TFOriginalModelManager(model_fn, **model_params) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)
            compression_callbacks = create_compression_callbacks(
                compression_ctrl, log_dir=config.log_dir)

            scheduler = build_scheduler(config=config,
                                        steps_per_epoch=train_steps)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            loss_obj = tf.keras.losses.CategoricalCrossentropy(
                label_smoothing=0.1)

            metrics = [
                tf.keras.metrics.CategoricalAccuracy(name='acc@1'),
                tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='acc@5'),
                tfa.metrics.MeanMetricWrapper(loss_obj, name='ce_loss'),
                tfa.metrics.MeanMetricWrapper(compression_ctrl.loss,
                                              name='cr_loss')
            ]

            compress_model.add_loss(compression_ctrl.loss)
            compress_model.compile(optimizer=optimizer,
                                   loss=loss_obj,
                                   metrics=metrics,
                                   run_eagerly=config.get('eager_mode', False))

            compress_model.summary()

            initial_epoch = 0
            if config.ckpt_path is not None:
                initial_epoch = resume_from_checkpoint(
                    model=compress_model,
                    compression_ctrl=compression_ctrl,
                    ckpt_path=config.ckpt_path,
                    steps_per_epoch=train_steps,
                    config=config)
            else:
                logger.info('initialization...')
                compression_ctrl.initialize(dataset=train_dataset)

    callbacks = get_callbacks(model_checkpoint=True,
                              include_tensorboard=True,
                              track_lr=True,
                              write_model_weights=False,
                              initial_step=initial_epoch * train_steps,
                              model_dir=config.log_dir,
                              ckpt_dir=config.checkpoint_save_dir)

    callbacks.append(
        get_progress_bar(stateful_metrics=['loss'] +
                         [metric.name for metric in metrics]))
    callbacks.extend(compression_callbacks)

    validation_kwargs = {
        'validation_data': validation_dataset,
        'validation_steps': validation_steps,
        'validation_freq': 1,
    }

    if 'train' in config.mode:
        logger.info('training...')
        compress_model.fit(train_dataset,
                           epochs=train_epochs,
                           steps_per_epoch=train_steps,
                           initial_epoch=initial_epoch,
                           callbacks=callbacks,
                           **validation_kwargs)

    logger.info('evaluation...')
    print_statistics(compression_ctrl.statistics())
    results = compress_model.evaluate(
        validation_dataset,
        steps=validation_steps,
        callbacks=[
            get_progress_bar(stateful_metrics=['loss'] +
                             [metric.name for metric in metrics])
        ],
        verbose=1)

    if config.metrics_dump is not None:
        write_metrics(results[1], config.metrics_dump)

    if 'export' in config.mode:
        save_path, save_format = get_saving_parameters(config)
        compression_ctrl.export_model(save_path, save_format)
        logger.info('Saved to {}'.format(save_path))
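
The config object above only needs to expose the attributes and dict-style get() lookups that run() reads, and it assumes the rest of the sample's helpers (get_distribution_strategy, get_model, and so on) are importable. As a purely illustrative sketch, not part of the sample, a minimal invocation could look like the following; the AttrDict helper and every field value are assumptions:

class AttrDict(dict):
    """Hypothetical config stand-in: attribute access on top of a plain dict."""
    __getattr__ = dict.__getitem__

config = AttrDict(
    metrics_dump=None,                       # or a path to dump metrics into
    model='resnet50',                        # model name resolved by get_model()
    num_classes=1000,
    pretrained=True,
    weights=None,
    epochs=5,
    log_dir='runs/classification',
    checkpoint_save_dir='runs/classification/ckpt',
    ckpt_path=None,                          # set this to resume from a checkpoint
    mode='train export',                     # run training and/or export
    nncf_config={},                          # NNCF compression configuration
)

run(config)
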
Example #2
def run(config):
    strategy = get_distribution_strategy(config)
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    # Create dataset
    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
    datasets = [builder.build() for builder in builders]
    train_builder, test_builder = builders
    train_dataset, test_dataset = datasets
    train_dist_dataset = strategy.experimental_distribute_dataset(
        train_dataset)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch
    num_test_batches = test_builder.steps_per_epoch

    # Create model builder
    model_builder = get_model_builder(config)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None)) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)

            scheduler = build_scheduler(config=config,
                                        steps_per_epoch=steps_per_epoch)

            optimizer = build_optimizer(config=config, scheduler=scheduler)

            eval_metric = model_builder.eval_metrics()
            loss_fn = model_builder.build_loss_fn(compress_model,
                                                  compression_ctrl.loss)
            predict_post_process_fn = model_builder.post_processing

            checkpoint = tf.train.Checkpoint(model=compress_model,
                                             optimizer=optimizer)
            checkpoint_manager = tf.train.CheckpointManager(
                checkpoint, config.checkpoint_save_dir, max_to_keep=None)

            initial_epoch = initial_step = 0
            if config.ckpt_path:
                initial_epoch, initial_step = resume_from_checkpoint(
                    checkpoint_manager, compression_ctrl, config.ckpt_path,
                    steps_per_epoch, config)
            else:
                logger.info('Initialization...')
                compression_ctrl.initialize(dataset=train_dataset)

    train_step = create_train_step_fn(strategy, compress_model, loss_fn,
                                      optimizer)
    test_step = create_test_step_fn(strategy, compress_model,
                                    predict_post_process_fn)

    if 'train' in config.mode:
        train(train_step, test_step, eval_metric, train_dist_dataset,
              test_dist_dataset, initial_epoch, initial_step, epochs,
              steps_per_epoch, checkpoint_manager, compression_ctrl,
              config.log_dir, optimizer, num_test_batches, config.print_freq)

    print_statistics(compression_ctrl.statistics())
    metric_result = evaluate(test_step, eval_metric, test_dist_dataset,
                             num_test_batches, config.print_freq)
    logger.info('Validation metric = {}'.format(metric_result))

    if config.metrics_dump is not None:
        write_metrics(metric_result['AP'], config.metrics_dump)

    if 'export' in config.mode:
        save_path, save_format = get_saving_parameters(config)
        compression_ctrl.export_model(save_path, save_format)
        logger.info("Saved to {}".format(save_path))
Example #3
def run_train(config):
    strategy = get_distribution_strategy(config)

    # Create dataset
    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)

    datasets = [builder.build() for builder in builders]
    train_builder, _ = builders
    train_dataset, calibration_dataset = datasets
    train_dist_dataset = strategy.experimental_distribute_dataset(
        train_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch

    # We use `model_batch_size` to create input layer for model
    config.model_batch_size = train_builder.batch_size

    # Create model builder
    model_builder = get_model_builder(config)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None),
                                is_training=True) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)

            scheduler = build_scheduler(config=config,
                                        steps_per_epoch=steps_per_epoch)

            optimizer = build_optimizer(config=config, scheduler=scheduler)

            loss_fn = model_builder.build_loss_fn(compress_model,
                                                  compression_ctrl.loss)

            variables = get_variables(compress_model)
            checkpoint = tf.train.Checkpoint(variables=variables,
                                             optimizer=optimizer,
                                             step=tf.Variable(0))
            checkpoint_manager = tf.train.CheckpointManager(
                checkpoint, config.checkpoint_save_dir, max_to_keep=None)

            initial_epoch = initial_step = 0
            if config.ckpt_path:
                initial_epoch, initial_step = resume_from_checkpoint(
                    checkpoint_manager, compression_ctrl, config.ckpt_path,
                    steps_per_epoch, config)
            else:
                logger.info('Initialization...')
                compression_ctrl.initialize(dataset=calibration_dataset)

    train_step = create_train_step_fn(strategy, compress_model, loss_fn,
                                      optimizer)

    train(train_step, train_dist_dataset, initial_epoch, initial_step, epochs,
          steps_per_epoch, checkpoint_manager, compression_ctrl,
          config.log_dir, optimizer, config.print_freq)

    logger.info('Compression statistics')
    print_statistics(compression_ctrl.statistics())
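
resume_from_checkpoint is another helper from the sample code and is not shown here. A sketch of the behaviour this example relies on, assuming the checkpoint layout defined above (a step variable saved next to the model variables and the optimizer), might be; the real helper may also restore compression-specific state:

import tensorflow as tf

def resume_from_checkpoint(checkpoint_manager, compression_ctrl, ckpt_path,
                           steps_per_epoch, config):
    """Illustrative sketch only, tied to this example's checkpoint layout."""
    # ckpt_path is assumed to be a checkpoint directory or an explicit prefix.
    path = tf.train.latest_checkpoint(ckpt_path) or ckpt_path
    checkpoint_manager.checkpoint.restore(path).expect_partial()

    initial_step = int(checkpoint_manager.checkpoint.step.numpy())
    initial_epoch = initial_step // steps_per_epoch
    logger.info('Resuming from global step {} (epoch {})'.format(
        initial_step, initial_epoch))
    return initial_epoch, initial_step
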
Example #4
def train(train_step, test_step, eval_metric, train_dist_dataset,
          test_dist_dataset, initial_epoch, initial_step, epochs,
          steps_per_epoch, checkpoint_manager, compression_ctrl, log_dir,
          optimizer, num_test_batches, print_freq):

    train_summary_writer = SummaryWriter(log_dir, 'train')
    validation_summary_writer = SummaryWriter(log_dir, 'validation')
    compression_summary_writer = SummaryWriter(log_dir, 'compression')

    timer = Timer()
    timer.tic()

    logger.info('Training...')
    for epoch in range(initial_epoch, epochs):
        logger.info('Epoch: {}/{}'.format(epoch, epochs))
        compression_ctrl.scheduler.epoch_step(epoch)

        for step, x in enumerate(train_dist_dataset):
            if epoch == initial_epoch and step < initial_step % steps_per_epoch:
                continue
            if step == steps_per_epoch:
                save_path = checkpoint_manager.save()
                logger.info('Saved checkpoint for epoch={}: {}'.format(
                    epoch, save_path))
                break

            compression_ctrl.scheduler.step()
            train_loss = train_step(x)
            train_metric_result = tf.nest.map_structure(
                lambda s: s.numpy().astype(float), train_loss)

            if np.isnan(train_metric_result['total_loss']):
                raise ValueError('total loss is NaN')

            train_metric_result.update(
                {'learning_rate': optimizer.lr(optimizer.iterations).numpy()})

            train_summary_writer(metrics=train_metric_result,
                                 step=optimizer.iterations.numpy())

            if step % print_freq == 0:
                time = timer.toc(average=False)
                logger.info('Step: {}/{} Time: {:.3f} sec'.format(
                    step, steps_per_epoch, time))
                logger.info('Training metric = {}'.format(train_metric_result))
                timer.tic()

        test_metric_result = evaluate(test_step, eval_metric,
                                      test_dist_dataset, num_test_batches,
                                      print_freq)
        validation_summary_writer(metrics=test_metric_result,
                                  step=optimizer.iterations.numpy())
        eval_metric.reset_states()
        logger.info('Validation metric = {}'.format(test_metric_result))

        statistics = compression_ctrl.statistics()
        print_statistics(statistics)
        statistics = {
            'compression/statistics/' + key: value
            for key, value in statistics.items()
            if isinstance(value, (int, float))
        }
        compression_summary_writer(metrics=statistics,
                                   step=optimizer.iterations.numpy())

    train_summary_writer.close()
    validation_summary_writer.close()
    compression_summary_writer.close()
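
The evaluate helper used by train() is not part of this example. Based only on the call sites above (it consumes the distributed test dataset, runs test_step per batch, accumulates results into the Keras-style eval_metric, and returns a result that can be logged and written to the summary), an illustrative sketch might look like the following; every detail beyond those call sites is an assumption:

def evaluate(test_step, metric, test_dist_dataset, num_batches, print_freq):
    """Illustrative sketch: accumulate test_step outputs into the metric."""
    timer = Timer()
    timer.tic()
    for batch_idx, x in enumerate(test_dist_dataset):
        labels, outputs = test_step(x)  # assumed return structure
        metric.update_state(labels, outputs)
        if batch_idx % print_freq == 0:
            logger.info('Evaluation step: {}/{} Time: {:.3f} sec'.format(
                batch_idx, num_batches, timer.toc(average=False)))
            timer.tic()
    return metric.result()
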