def run(config):
    strategy = get_distribution_strategy(config)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    model_fn, model_params = get_model(
        config.model,
        input_shape=config.get('input_info', {}).get('sample_size', None),
        num_classes=config.get('num_classes', 1000),
        pretrained=config.get('pretrained', False),
        weights=config.get('weights', None))

    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
    datasets = [builder.build() for builder in builders]

    train_builder, validation_builder = builders
    train_dataset, validation_dataset = datasets

    train_epochs = config.epochs
    train_steps = train_builder.steps_per_epoch
    validation_steps = validation_builder.steps_per_epoch

    with TFOriginalModelManager(model_fn, **model_params) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)
            compression_callbacks = create_compression_callbacks(
                compression_ctrl, log_dir=config.log_dir)

            scheduler = build_scheduler(config=config, steps_per_epoch=train_steps)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            loss_obj = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1)

            metrics = [
                tf.keras.metrics.CategoricalAccuracy(name='acc@1'),
                tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='acc@5'),
                tfa.metrics.MeanMetricWrapper(loss_obj, name='ce_loss'),
                tfa.metrics.MeanMetricWrapper(compression_ctrl.loss, name='cr_loss')
            ]

            compress_model.add_loss(compression_ctrl.loss)

            compress_model.compile(optimizer=optimizer,
                                   loss=loss_obj,
                                   metrics=metrics,
                                   run_eagerly=config.get('eager_mode', False))

            compress_model.summary()

            initial_epoch = 0
            if config.ckpt_path is not None:
                initial_epoch = resume_from_checkpoint(
                    model=compress_model,
                    compression_ctrl=compression_ctrl,
                    ckpt_path=config.ckpt_path,
                    steps_per_epoch=train_steps,
                    config=config)
            else:
                logger.info('initialization...')
                compression_ctrl.initialize(dataset=train_dataset)

    callbacks = get_callbacks(model_checkpoint=True,
                              include_tensorboard=True,
                              track_lr=True,
                              write_model_weights=False,
                              initial_step=initial_epoch * train_steps,
                              model_dir=config.log_dir,
                              ckpt_dir=config.checkpoint_save_dir)
    callbacks.append(get_progress_bar(
        stateful_metrics=['loss'] + [metric.name for metric in metrics]))
    callbacks.extend(compression_callbacks)

    validation_kwargs = {
        'validation_data': validation_dataset,
        'validation_steps': validation_steps,
        'validation_freq': 1,
    }

    if 'train' in config.mode:
        logger.info('training...')
        compress_model.fit(train_dataset,
                           epochs=train_epochs,
                           steps_per_epoch=train_steps,
                           initial_epoch=initial_epoch,
                           callbacks=callbacks,
                           **validation_kwargs)

    logger.info('evaluation...')
    print_statistics(compression_ctrl.statistics())
    results = compress_model.evaluate(
        validation_dataset,
        steps=validation_steps,
        callbacks=[get_progress_bar(
            stateful_metrics=['loss'] + [metric.name for metric in metrics])],
        verbose=1)

    if config.metrics_dump is not None:
        write_metrics(results[1], config.metrics_dump)

    if 'export' in config.mode:
        save_path, save_format = get_saving_parameters(config)
        compression_ctrl.export_model(save_path, save_format)
        logger.info('Saved to {}'.format(save_path))
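# NOTE: get_progress_bar is defined elsewhere in the sample. As a point of
# reference, a minimal sketch of it, assuming it simply wraps the stock Keras
# progress bar callback (the actual implementation may differ):

def get_progress_bar(stateful_metrics):
    """Returns a per-step progress bar callback.

    Metrics named in `stateful_metrics` are shown as their latest value
    instead of being averaged over the epoch.
    """
    return tf.keras.callbacks.ProgbarLogger(count_mode='steps',
                                            stateful_metrics=stateful_metrics)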
def run(config):
    strategy = get_distribution_strategy(config)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    # Create datasets
    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
    datasets = [builder.build() for builder in builders]
    train_builder, test_builder = builders
    train_dataset, test_dataset = datasets
    train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch
    num_test_batches = test_builder.steps_per_epoch

    # Create model builder
    model_builder = get_model_builder(config)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None)) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)

            scheduler = build_scheduler(config=config, steps_per_epoch=steps_per_epoch)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            eval_metric = model_builder.eval_metrics()
            loss_fn = model_builder.build_loss_fn(compress_model, compression_ctrl.loss)
            predict_post_process_fn = model_builder.post_processing

            checkpoint = tf.train.Checkpoint(model=compress_model, optimizer=optimizer)
            checkpoint_manager = tf.train.CheckpointManager(
                checkpoint, config.checkpoint_save_dir, max_to_keep=None)

            initial_epoch = initial_step = 0
            if config.ckpt_path:
                initial_epoch, initial_step = resume_from_checkpoint(
                    checkpoint_manager, compression_ctrl, config.ckpt_path,
                    steps_per_epoch, config)
            else:
                logger.info('Initialization...')
                compression_ctrl.initialize(dataset=train_dataset)

    train_step = create_train_step_fn(strategy, compress_model, loss_fn, optimizer)
    test_step = create_test_step_fn(strategy, compress_model, predict_post_process_fn)

    if 'train' in config.mode:
        train(train_step, test_step, eval_metric, train_dist_dataset,
              test_dist_dataset, initial_epoch, initial_step, epochs,
              steps_per_epoch, checkpoint_manager, compression_ctrl,
              config.log_dir, optimizer, num_test_batches, config.print_freq)

    print_statistics(compression_ctrl.statistics())

    metric_result = evaluate(test_step, eval_metric, test_dist_dataset,
                             num_test_batches, config.print_freq)
    logger.info('Validation metric = {}'.format(metric_result))

    if config.metrics_dump is not None:
        write_metrics(metric_result['AP'], config.metrics_dump)

    if 'export' in config.mode:
        save_path, save_format = get_saving_parameters(config)
        compression_ctrl.export_model(save_path, save_format)
        logger.info('Saved to {}'.format(save_path))
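# NOTE: create_train_step_fn and create_test_step_fn are helpers defined
# alongside this sample. A minimal sketch of create_train_step_fn, assuming
# each dataset element is an (inputs, labels) pair and loss_fn returns a dict
# of scalar losses that includes 'total_loss' (which the train loop below
# checks for NaN):

def create_train_step_fn(strategy, model, loss_fn, optimizer):
    """Creates a distributed train step wrapped in a single tf.function."""

    def _train_step_fn(inputs):
        inputs, labels = inputs
        with tf.GradientTape() as tape:
            outputs = model(inputs, training=True)
            losses = loss_fn(labels, outputs)
            # Scale by the replica count so that gradients summed across
            # replicas match the gradient of the global-batch loss.
            per_replica_loss = losses['total_loss'] / strategy.num_replicas_in_sync

        grads = tape.gradient(per_replica_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return losses

    @tf.function
    def train_step(dataset_inputs):
        per_replica_losses = strategy.run(_train_step_fn, args=(dataset_inputs,))
        # Reduce the per-replica loss dict to plain scalars for logging.
        return tf.nest.map_structure(
            lambda loss: strategy.reduce(tf.distribute.ReduceOp.MEAN, loss, axis=None),
            per_replica_losses)

    return train_step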
def run_train(config):
    strategy = get_distribution_strategy(config)

    # Create datasets
    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
    datasets = [builder.build() for builder in builders]
    train_builder, _ = builders
    train_dataset, calibration_dataset = datasets
    train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch

    # `model_batch_size` is used to create the input layer of the model
    config.model_batch_size = train_builder.batch_size

    # Create model builder
    model_builder = get_model_builder(config)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None),
                                is_training=True) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)

            scheduler = build_scheduler(config=config, steps_per_epoch=steps_per_epoch)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            loss_fn = model_builder.build_loss_fn(compress_model, compression_ctrl.loss)

            variables = get_variables(compress_model)
            checkpoint = tf.train.Checkpoint(variables=variables,
                                             optimizer=optimizer,
                                             step=tf.Variable(0))
            checkpoint_manager = tf.train.CheckpointManager(
                checkpoint, config.checkpoint_save_dir, max_to_keep=None)

            initial_epoch = initial_step = 0
            if config.ckpt_path:
                initial_epoch, initial_step = resume_from_checkpoint(
                    checkpoint_manager, compression_ctrl, config.ckpt_path,
                    steps_per_epoch, config)
            else:
                logger.info('Initialization...')
                compression_ctrl.initialize(dataset=calibration_dataset)

    train_step = create_train_step_fn(strategy, compress_model, loss_fn, optimizer)

    train(train_step, train_dist_dataset, initial_epoch, initial_step, epochs,
          steps_per_epoch, checkpoint_manager, compression_ctrl, config.log_dir,
          optimizer, config.print_freq)

    logger.info('Compression statistics')
    print_statistics(compression_ctrl.statistics())
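# NOTE: resume_from_checkpoint is shared with run() above. A minimal sketch,
# assuming the optimizer's iteration counter is part of the checkpoint so the
# epoch/step counters can be derived from it; compression_ctrl and config are
# accepted to match the call sites but are unused in this sketch:

def resume_from_checkpoint(checkpoint_manager, compression_ctrl, ckpt_path,
                           steps_per_epoch, config):
    """Restores a checkpoint and returns (initial_epoch, initial_step)."""
    # ckpt_path may point to a directory of checkpoints or a specific file.
    path_to_checkpoint = tf.train.latest_checkpoint(ckpt_path) or ckpt_path
    status = checkpoint_manager.checkpoint.restore(path_to_checkpoint)
    status.expect_partial()

    optimizer = checkpoint_manager.checkpoint.optimizer
    initial_step = int(optimizer.iterations.numpy())
    initial_epoch = initial_step // steps_per_epoch

    logger.info('Resuming from epoch {} (global step {})'.format(
        initial_epoch, initial_step))
    return initial_epoch, initial_step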
def train(train_step, test_step, eval_metric, train_dist_dataset, test_dist_dataset,
          initial_epoch, initial_step, epochs, steps_per_epoch, checkpoint_manager,
          compression_ctrl, log_dir, optimizer, num_test_batches, print_freq):

    train_summary_writer = SummaryWriter(log_dir, 'train')
    validation_summary_writer = SummaryWriter(log_dir, 'validation')
    compression_summary_writer = SummaryWriter(log_dir, 'compression')

    timer = Timer()
    timer.tic()

    logger.info('Training...')
    for epoch in range(initial_epoch, epochs):
        logger.info('Epoch: {}/{}'.format(epoch, epochs))
        compression_ctrl.scheduler.epoch_step(epoch)

        for step, x in enumerate(train_dist_dataset):
            # When resuming, skip the steps already covered by the checkpoint
            if epoch == initial_epoch and step < initial_step % steps_per_epoch:
                continue

            # End the epoch after steps_per_epoch batches and save a checkpoint
            if step == steps_per_epoch:
                save_path = checkpoint_manager.save()
                logger.info('Saved checkpoint for epoch={}: {}'.format(
                    epoch, save_path))
                break

            compression_ctrl.scheduler.step()
            train_loss = train_step(x)
            train_metric_result = tf.nest.map_structure(
                lambda s: s.numpy().astype(float), train_loss)

            if np.isnan(train_metric_result['total_loss']):
                raise ValueError('total loss is NaN')

            train_metric_result.update(
                {'learning_rate': optimizer.lr(optimizer.iterations).numpy()})

            train_summary_writer(metrics=train_metric_result,
                                 step=optimizer.iterations.numpy())

            if step % print_freq == 0:
                time = timer.toc(average=False)
                logger.info('Step: {}/{} Time: {:.3f} sec'.format(
                    step, steps_per_epoch, time))
                logger.info('Training metric = {}'.format(train_metric_result))
                timer.tic()

        test_metric_result = evaluate(test_step, eval_metric, test_dist_dataset,
                                      num_test_batches, print_freq)
        validation_summary_writer(metrics=test_metric_result,
                                  step=optimizer.iterations.numpy())
        eval_metric.reset_states()
        logger.info('Validation metric = {}'.format(test_metric_result))

        statistics = compression_ctrl.statistics()
        print_statistics(statistics)
        # Keep only scalar compression statistics for the summary writer
        statistics = {'compression/statistics/' + key: value
                      for key, value in statistics.items()
                      if isinstance(value, (int, float))}
        compression_summary_writer(metrics=statistics,
                                   step=optimizer.iterations.numpy())

    train_summary_writer.close()
    validation_summary_writer.close()
    compression_summary_writer.close()
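# NOTE: the evaluate helper used above, as a minimal sketch, assuming the
# distributed test_step returns (labels, outputs) in a form that the metric's
# update_state accepts directly:

def evaluate(test_step, metric, test_dist_dataset, num_batches, print_freq):
    """Runs test_step over the dataset and returns the aggregated metric."""
    timer = Timer()
    timer.tic()

    logger.info('Testing...')
    for batch_idx, x in enumerate(test_dist_dataset):
        labels, outputs = test_step(x)
        metric.update_state(labels, outputs)

        if batch_idx % print_freq == 0:
            time = timer.toc(average=False)
            logger.info('Predict for batch: {}/{} Time: {:.3f} sec'.format(
                batch_idx, num_batches, time))
            timer.tic()

    logger.info('Evaluating predictions...')
    return metric.result()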