import os
import shutil
import tempfile

import tensorflow as tf

# NNCF / project-local helpers used below (TFCompressionState, load_compression_state,
# CheckpointManagerCallback, create_compression_callbacks, QuantizationBuilder, etc.)
# are assumed to be importable from the surrounding repository.


def test_checkpoint_callback_make_checkpoints(save_freq):
    compression_ctrl, model = get_simple_compressed_model()
    compression_callbacks = create_compression_callbacks(compression_ctrl, log_tensorboard=False)

    dataset_len = 8
    dummy_x = tf.random.normal((dataset_len,) + model.input_shape[1:])
    dummy_y = tf.random.normal((dataset_len,) + model.output_shape[1:])

    model.compile(loss=tf.losses.CategoricalCrossentropy())

    ckpt_path = tempfile.mkdtemp()
    ckpt = tf.train.Checkpoint(model=model,
                               compression_state=TFCompressionState(compression_ctrl))
    model.fit(dummy_x, dummy_y,
              epochs=5,
              batch_size=2,
              callbacks=[CheckpointManagerCallback(ckpt, ckpt_path, save_freq),
                         *compression_callbacks])
    assert sorted(os.listdir(ckpt_path)) == REF_CKPT_DIR[save_freq]

    new_compression_ctrl, new_model = get_simple_compressed_model()
    new_ckpt = tf.train.Checkpoint(model=new_model,
                                   compression_state=TFCompressionState(new_compression_ctrl))
    new_ckpt.restore(tf.train.latest_checkpoint(ckpt_path))
    assert new_compression_ctrl.get_state() == compression_ctrl.get_state()
    assert tf.reduce_all([tf.reduce_all(w_new == w)
                          for w_new, w in zip(new_model.weights, model.weights)])

    shutil.rmtree(ckpt_path)
def test_checkpoint_callback_make_checkpoints(mocker, tmp_path):
    save_freq = 2
    config = get_basic_quantization_config()
    gen_setup_spy = mocker.spy(QuantizationBuilder, '_get_quantizer_setup')
    model, compression_ctrl = create_compressed_model_and_algo_for_test(get_basic_conv_test_model(),
                                                                        config,
                                                                        force_no_init=True)
    assert isinstance(compression_ctrl, QuantizationController)
    quantizer_setup = gen_setup_spy.spy_return

    compression_callbacks = create_compression_callbacks(compression_ctrl, log_tensorboard=False)

    dataset_len = 8
    dummy_x = tf.random.normal((dataset_len,) + model.input_shape[1:])
    dummy_y = tf.random.normal((dataset_len,) + model.output_shape[1:])

    model.compile(loss=tf.losses.CategoricalCrossentropy())

    ckpt_path = tmp_path / 'checkpoint'
    ckpt = tf.train.Checkpoint(model=model,
                               compression_state=TFCompressionState(compression_ctrl))
    model.fit(dummy_x, dummy_y,
              epochs=5,
              batch_size=2,
              callbacks=[CheckpointManagerCallback(ckpt, str(ckpt_path), save_freq),
                         *compression_callbacks])
    assert sorted(os.listdir(ckpt_path)) == REF_CKPT_DIR[save_freq]

    new_compression_state = load_compression_state(ckpt_path)
    new_model, new_compression_ctrl = create_compressed_model_and_algo_for_test(get_basic_conv_test_model(),
                                                                                config,
                                                                                new_compression_state)
    new_model.compile(loss=tf.losses.CategoricalCrossentropy())
    new_ckpt = tf.train.Checkpoint(model=new_model,
                                   compression_state=TFCompressionState(new_compression_ctrl))
    load_checkpoint(new_ckpt, ckpt_path)

    builder = QuantizationBuilder(config)
    builder.load_state(new_compression_state['builder_state'])
    # pylint:disable=protected-access
    new_quantizer_setup = builder._quantizer_setup
    assert _quantization_setup_cmp(quantizer_setup, new_quantizer_setup)

    assert new_compression_ctrl.get_state() == compression_ctrl.get_state()
    assert tf.reduce_all([tf.reduce_all(w_new == w)
                          for w_new, w in zip(new_model.weights, model.weights)])
def _save_and_load_compression_state(compression_ctrl, tmp_path):
    checkpoint_path = tmp_path / 'compression_state'
    checkpoint_to_save = tf.train.Checkpoint(compression_state=TFCompressionState(compression_ctrl))
    checkpoint_to_save.save(checkpoint_path)
    compression_state = load_compression_state(str(checkpoint_path.parent))
    return compression_state
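# A minimal usage sketch (an assumption, not taken from the source): round-trip a
# controller's compression state through the helper above inside a pytest test.
# `get_basic_quantization_config`, `get_basic_conv_test_model` and
# `create_compressed_model_and_algo_for_test` are the helpers already used in the
# tests above; the test name and body are illustrative only.
def test_compression_state_round_trip(tmp_path):
    config = get_basic_quantization_config()
    _, compression_ctrl = create_compressed_model_and_algo_for_test(get_basic_conv_test_model(),
                                                                    config,
                                                                    force_no_init=True)
    compression_state = _save_and_load_compression_state(compression_ctrl, tmp_path)

    # Rebuilding from the restored state should reproduce the controller state
    _, new_compression_ctrl = create_compressed_model_and_algo_for_test(get_basic_conv_test_model(),
                                                                        config,
                                                                        compression_state)
    assert new_compression_ctrl.get_state() == compression_ctrl.get_state()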
def od_checkpoint_saver(config):
    """
    Loads an object detection checkpoint and re-saves it without the optimizer,
    reducing the checkpoint's memory footprint.
    """
    model_builder = get_model_od_builder(config)
    model = model_builder.build_model()

    compression_state = load_compression_state(config.ckpt_path)
    compression_ctrl, compress_model = create_compressed_model(model, config.nncf_config, compression_state)

    checkpoint = tf.train.Checkpoint(model=compress_model,
                                     compression_state=TFCompressionState(compression_ctrl))
    load_and_save_checkpoint(checkpoint, config)
def seg_checkpoint_saver(config):
    """
    Loads a segmentation checkpoint and re-saves it without the optimizer,
    reducing the checkpoint's memory footprint.
    """
    model_builder = get_model_seg_builder(config)
    model = model_builder.build_model()

    compression_state = load_compression_state(config.ckpt_path)
    compression_ctrl, compress_model = create_compressed_model(model, config.nncf_config, compression_state)

    variables = get_variables(compress_model)
    checkpoint = tf.train.Checkpoint(variables=variables,
                                     compression_state=TFCompressionState(compression_ctrl),
                                     step=tf.Variable(0))
    load_and_save_checkpoint(checkpoint, config)
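# `load_and_save_checkpoint` is defined elsewhere in the repository. Below is a
# minimal sketch of what it is assumed to do, given how the two savers above call
# it: restore the latest checkpoint from `config.ckpt_path` into the passed
# `tf.train.Checkpoint` (which deliberately does not track the optimizer) and
# re-save it to `config.checkpoint_save_dir`. This body is an assumption, not the
# actual implementation.
def load_and_save_checkpoint(checkpoint, config):
    load_checkpoint(checkpoint, config.ckpt_path)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    config.checkpoint_save_dir,
                                                    max_to_keep=None)
    checkpoint_manager.save()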
def export(config):
    model_builder = get_model_builder(config)
    model = model_builder.build_model(weights=config.get('weights', None))

    compression_state = None
    if config.ckpt_path:
        compression_state = load_compression_state(config.ckpt_path)

    compression_ctrl, compress_model = create_compressed_model(model, config.nncf_config, compression_state)

    if config.ckpt_path:
        checkpoint = tf.train.Checkpoint(model=compress_model,
                                         compression_state=TFCompressionState(compression_ctrl))
        load_checkpoint(checkpoint, config.ckpt_path)

    save_path, save_format = get_saving_parameters(config)
    compression_ctrl.export_model(save_path, save_format)
    logger.info("Saved to {}".format(save_path))
def restore_compressed_model(config, strategy, model_builder, ckpt_path=None):
    compression_state = None
    if ckpt_path:
        compression_state = load_compression_state(ckpt_path)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None),
                                is_training=False) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(model,
                                                                       config.nncf_config,
                                                                       compression_state)

            variables = get_variables(compress_model)
            checkpoint = tf.train.Checkpoint(variables=variables,
                                             compression_state=TFCompressionState(compression_ctrl),
                                             step=tf.Variable(0))
            if ckpt_path:
                # Restore from the `ckpt_path` argument, not `config.ckpt_path`:
                # the guard above checks the argument, and the two may differ
                # when the caller passes an explicit path.
                load_checkpoint(checkpoint, ckpt_path)

    return compression_ctrl, compress_model, checkpoint
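# A minimal usage sketch (an assumption): restoring a compressed model for
# evaluation via `restore_compressed_model`. `get_distribution_strategy` and
# `get_model_builder` are the same helpers used by `run_train` below;
# `evaluate_from_checkpoint` itself is a hypothetical caller, illustrative only.
def evaluate_from_checkpoint(config):
    strategy = get_distribution_strategy(config)
    model_builder = get_model_builder(config)
    compression_ctrl, compress_model, _ = restore_compressed_model(config, strategy, model_builder,
                                                                   ckpt_path=config.ckpt_path)
    logger.info(compression_ctrl.statistics().to_str())
    return compress_model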
def run_train(config):
    strategy = get_distribution_strategy(config)

    # Create dataset
    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
    datasets = [builder.build() for builder in builders]
    train_builder, _ = builders
    train_dataset, calibration_dataset = datasets
    train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch

    # We use `model_batch_size` to create the input layer for the model
    config.model_batch_size = train_builder.batch_size

    # Create model builder
    model_builder = get_model_builder(config)

    # Register additional parameters in the NNCFConfig for initialization
    # of the compressed model during building
    nncf_config = config.nncf_config
    nncf_config = register_default_init_args(nncf_config=nncf_config,
                                             data_loader=calibration_dataset,
                                             batch_size=train_builder.global_batch_size)

    resume_training = config.ckpt_path is not None

    compression_state = None
    if resume_training:
        compression_state = load_compression_state(config.ckpt_path)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None),
                                is_training=True) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(model, nncf_config, compression_state)

            scheduler = build_scheduler(config=config, steps_per_epoch=steps_per_epoch)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            loss_fn = model_builder.build_loss_fn(compress_model, compression_ctrl.loss)

            variables = get_variables(compress_model)
            checkpoint = tf.train.Checkpoint(variables=variables,
                                             optimizer=optimizer,
                                             compression_state=TFCompressionState(compression_ctrl),
                                             step=tf.Variable(0))
            checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                            config.checkpoint_save_dir,
                                                            max_to_keep=None)

            initial_epoch = initial_step = 0
            if resume_training:
                initial_epoch, initial_step = resume_from_checkpoint(checkpoint_manager,
                                                                     config.ckpt_path,
                                                                     steps_per_epoch)

    statistics = compression_ctrl.statistics()
    logger.info(statistics.to_str())

    train_step = create_train_step_fn(strategy, compress_model, loss_fn, optimizer)

    train(train_step, train_dist_dataset, initial_epoch, initial_step,
          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl,
          config.log_dir, optimizer, config.print_freq)

    logger.info('Compression statistics')
    statistics = compression_ctrl.statistics()
    logger.info(statistics.to_str())
def run(config):
    strategy = get_distribution_strategy(config)
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    # Create dataset
    train_builder, test_builder = get_dataset_builders(config, strategy.num_replicas_in_sync)
    train_dataset = train_builder.build()
    test_dataset = test_builder.build()
    train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch
    num_test_batches = test_builder.steps_per_epoch

    # Create model builder
    model_builder = get_model_builder(config)

    def model_eval_fn(model):
        test_step = create_test_step_fn(strategy, model, model_builder.post_processing)
        metric_result = evaluate(test_step, model_builder.eval_metrics(),
                                 test_dist_dataset, num_test_batches, config.print_freq)
        return metric_result['AP']

    # Register additional parameters in the NNCFConfig for initialization
    # of the compressed model during building
    nncf_config = config.nncf_config
    nncf_config = register_default_init_args(nncf_config=nncf_config,
                                             data_loader=train_dataset,
                                             batch_size=train_builder.global_batch_size)

    resume_training = config.ckpt_path is not None

    compression_state = None
    if resume_training:
        compression_state = load_compression_state(config.ckpt_path)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None)) as model:
        with strategy.scope():
            config.nncf_config.register_extra_structs([ModelEvaluationArgs(eval_fn=model_eval_fn)])
            compression_ctrl, compress_model = create_compressed_model(model, nncf_config, compression_state)
            scheduler = build_scheduler(config=config, steps_per_epoch=steps_per_epoch)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            eval_metric = model_builder.eval_metrics()
            loss_fn = model_builder.build_loss_fn(compress_model, compression_ctrl.loss)
            predict_post_process_fn = model_builder.post_processing

            checkpoint = tf.train.Checkpoint(model=compress_model,
                                             optimizer=optimizer,
                                             compression_state=TFCompressionState(compression_ctrl))
            checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                            config.checkpoint_save_dir,
                                                            max_to_keep=None)

            initial_epoch = initial_step = 0
            if resume_training:
                initial_epoch, initial_step = resume_from_checkpoint(checkpoint_manager,
                                                                     config.ckpt_path,
                                                                     steps_per_epoch)

    train_step = create_train_step_fn(strategy, compress_model, loss_fn, optimizer)
    test_step = create_test_step_fn(strategy, compress_model, predict_post_process_fn)

    if 'train' in config.mode:
        if is_accuracy_aware_training(config):
            train_summary_writer = SummaryWriter(config.log_dir, 'train')
            timer = Timer()
            timer.tic()

            def train_epoch_fn(compression_ctrl, model, epoch, **kwargs):
                train_step = create_train_step_fn(strategy, model, loss_fn, optimizer)
                train_epoch(train_step, compression_ctrl, epoch, initial_epoch,
                            steps_per_epoch, optimizer, checkpoint_manager,
                            train_dist_dataset, train_summary_writer, initial_step,
                            config.print_freq, timer)

            def validate_fn(model, **kwargs):
                test_step = create_test_step_fn(strategy, model, predict_post_process_fn)
                metric_result = evaluate(test_step, eval_metric, test_dist_dataset,
                                         num_test_batches, config.print_freq)
                return metric_result['AP']

            acc_aware_training_loop = create_accuracy_aware_training_loop(nncf_config, compression_ctrl)
            compress_model = acc_aware_training_loop.run(compress_model,
                                                         train_epoch_fn=train_epoch_fn,
                                                         validate_fn=validate_fn,
                                                         tensorboard_writer=SummaryWriter(config.log_dir,
                                                                                          'accuracy_aware_training'),
                                                         log_dir=config.log_dir)
        else:
            train(train_step, test_step, eval_metric, train_dist_dataset, test_dist_dataset,
                  initial_epoch, initial_step, epochs, steps_per_epoch, checkpoint_manager,
                  compression_ctrl, config.log_dir, optimizer, num_test_batches, config.print_freq)

        statistics = compression_ctrl.statistics()
        logger.info(statistics.to_str())

    metric_result = evaluate(test_step, eval_metric, test_dist_dataset,
                             num_test_batches, config.print_freq)
    logger.info('Validation metric = {}'.format(metric_result))

    if config.metrics_dump is not None:
        write_metrics(metric_result['AP'], config.metrics_dump)

    if 'export' in config.mode:
        save_path, save_format = get_saving_parameters(config)
        compression_ctrl.export_model(save_path, save_format)
        logger.info("Saved to {}".format(save_path))