def build_model(self, weights=None, is_training=None):
    """Create the YOLOv4 Keras model, optionally restoring pretrained weights."""
    net_outputs = self.model_outputs(self._input_layer, is_training)
    yolo_model = tf.keras.models.Model(inputs=self._input_layer,
                                       outputs=net_outputs,
                                       name='yolo_v4')
    if weights:
        logger.info('Loaded pretrained weights from {}'.format(weights))
        # by_name=True: match layers by name so partially compatible
        # checkpoints still load.
        yolo_model.load_weights(weights, by_name=True)
    return yolo_model
def evaluate(test_step, metric, test_dist_dataset, num_batches, print_freq):
    """Runs evaluation steps and aggregate metrics"""
    stopwatch = Timer()
    stopwatch.tic()

    logger.info('Testing...')
    for batch_index, sample in enumerate(test_dist_dataset):
        labels, outputs = test_step(sample)
        metric.update_state(labels, outputs)

        if batch_index % print_freq == 0:
            elapsed = stopwatch.toc(average=False)
            logger.info('Predict for batch: {}/{} Time: {:.3f} sec'.format(
                batch_index, num_batches, elapsed))
            stopwatch.tic()

    logger.info('Total time: {:.3f} sec'.format(stopwatch.total_time))
    stopwatch.reset()

    # Metric aggregation (e.g. COCO-style evaluation) can itself be slow,
    # so it is timed separately.
    logger.info('Evaluating predictions...')
    stopwatch.tic()
    result = metric.result()
    stopwatch.toc(average=False)
    logger.info('Total time: {:.3f} sec'.format(stopwatch.total_time))

    return result
def train(train_step, train_dist_dataset, initial_epoch, initial_step,
          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl,
          log_dir, optimizer, print_freq):
    """Custom training loop with compression-aware LR/compression scheduling.

    Resumes at (initial_epoch, initial_step), saves one checkpoint per epoch,
    and writes training metrics and scalar compression statistics to
    TensorBoard via `SummaryWriter`.

    Raises:
        ValueError: if the reported 'total_loss' becomes NaN.
    """
    train_summary_writer = SummaryWriter(log_dir, 'train')
    compression_summary_writer = SummaryWriter(log_dir, 'compression')

    timer = Timer()
    timer.tic()

    logger.info('Training...')
    for epoch in range(initial_epoch, epochs):
        logger.info('Epoch: {}/{}'.format(epoch, epochs))
        compression_ctrl.scheduler.epoch_step(epoch)

        for step, x in enumerate(train_dist_dataset):
            # Skip batches already consumed before the resume point
            # (only relevant for the first resumed epoch).
            if epoch == initial_epoch and step < initial_step % steps_per_epoch:
                continue

            checkpoint_manager.checkpoint.step.assign_add(1)

            # step == steps_per_epoch marks the epoch boundary: save and
            # move on to the next epoch.
            if step == steps_per_epoch:
                save_path = checkpoint_manager.save()
                logger.info('Saved checkpoint for epoch={}: {}'.format(
                    epoch, save_path))
                break

            compression_ctrl.scheduler.step()
            train_loss = train_step(x)
            # Convert per-loss tensors to plain floats for logging/NaN check.
            train_metric_result = tf.nest.map_structure(
                lambda s: s.numpy().astype(float), train_loss)

            if np.isnan(train_metric_result['total_loss']):
                raise ValueError('total loss is NaN')

            train_metric_result.update(
                {'learning_rate': optimizer.lr(optimizer.iterations).numpy()})
            train_summary_writer(metrics=train_metric_result,
                                 step=optimizer.iterations.numpy())

            if step % print_freq == 0:
                time = timer.toc(average=False)
                logger.info('Step: {}/{} Time: {:.3f} sec'.format(
                    step, steps_per_epoch, time))
                logger.info('Training metric = {}'.format(train_metric_result))
                timer.tic()

        # Once per epoch: print full compression statistics, and push only
        # the scalar ones to TensorBoard.
        statistics = compression_ctrl.statistics()
        print_statistics(statistics)
        statistics = {
            'compression/statistics/' + key: value
            for key, value in statistics.items()
            if isinstance(value, (int, float))
        }
        compression_summary_writer(metrics=statistics,
                                   step=optimizer.iterations.numpy())

    train_summary_writer.close()
    compression_summary_writer.close()
def export(config):
    """Compile the compressed classification model and export it in the
    configured serving format."""
    model_fn, model_params = get_model(
        config.model,
        input_shape=config.get('input_info', {}).get('sample_size', None),
        num_classes=config.get('num_classes', 1000),
        pretrained=config.get('pretrained', False),
        weights=config.get('weights', None))
    keras_model = model_fn(**model_params)

    compression_ctrl, compress_model = create_compressed_model(
        keras_model, config.nncf_config)

    accuracy_metrics = [
        tf.keras.metrics.CategoricalAccuracy(name='acc@1'),
        tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='acc@5'),
    ]
    cross_entropy = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1)
    compress_model.compile(loss=cross_entropy, metrics=accuracy_metrics)
    compress_model.summary()

    if config.ckpt_path is not None:
        load_checkpoint(model=compress_model, ckpt_path=config.ckpt_path)

    save_path, save_format = get_saving_parameters(config)
    compression_ctrl.export_model(save_path, save_format)
    logger.info('Saved to {}'.format(save_path))
def __init__(self, params):
    """Set up a sum-reduced Huber loss for Fast R-CNN box regression."""
    delta = params.huber_loss_delta
    logger.info('FastrcnnBoxLoss huber_loss_delta {}'.format(delta))
    # The delta is typically around the mean value of the regression target.
    # For instance, the regression targets of a 512x512 input with 6 anchors
    # on the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
    self._huber_loss = tf.keras.losses.Huber(
        delta=delta, reduction=tf.keras.losses.Reduction.SUM)
def resume_from_checkpoint(checkpoint_manager, compression_ctrl, ckpt_path,
                           steps_per_epoch):
    """Restore training state from a checkpoint and sync the compression
    scheduler.

    Args:
        checkpoint_manager: tf.train.CheckpointManager whose checkpoint holds
            the optimizer.
        compression_ctrl: NNCF compression controller whose scheduler state
            is restored.
        ckpt_path: checkpoint file or directory.
        steps_per_epoch: number of training steps per epoch.

    Returns:
        Tuple (initial_epoch, initial_step) to resume from; (0, 0) when no
        checkpoint was restored.
    """
    if load_checkpoint(checkpoint_manager.checkpoint, ckpt_path) == 0:
        # Fix: callers unpack two values
        # (`initial_epoch, initial_step = resume_from_checkpoint(...)`),
        # so the no-checkpoint path must return a pair, not a bare 0.
        return 0, 0

    optimizer = checkpoint_manager.checkpoint.optimizer
    initial_step = optimizer.iterations.numpy()
    initial_epoch = initial_step // steps_per_epoch

    compression_ctrl.scheduler.load_state(initial_step, steps_per_epoch)
    logger.info('Resuming from epoch %d (global step %d)',
                initial_epoch, initial_step)
    return initial_epoch, initial_step
def build_scheduler(config, epoch_size, batch_size, steps):
    """Create a learning-rate schedule from the 'optimizer' section of config.

    Supported schedule types: 'exponential', 'piecewise_constant', 'step'.

    Args:
        config: configuration mapping with an optional 'optimizer' section.
        epoch_size: number of training samples per epoch.
        batch_size: global batch size.
        steps: number of training steps per epoch.

    Returns:
        A learning-rate schedule object.

    Raises:
        ValueError: if a required schedule parameter is missing.
        KeyError: if the schedule type is unknown.
    """
    optimizer_config = config.get('optimizer', {})
    schedule_type = optimizer_config.get('schedule_type', 'exponential').lower()
    schedule_params = optimizer_config.get('schedule_params', {})

    if schedule_type == 'exponential':
        decay_rate = schedule_params.get('decay_rate', None)
        if decay_rate is None:
            raise ValueError('decay_rate parameter must be specified '
                             'for the exponential scheduler')

        initial_lr = schedule_params.get('initial_lr', None)
        if initial_lr is None:
            raise ValueError('initial_lr parameter must be specified '
                             'for the exponential scheduler')

        decay_epochs = schedule_params.get('decay_epochs', None)
        # NOTE(review): a missing decay_epochs yields decay_steps == 0, which
        # makes ExponentialDecay divide by zero when called — confirm intent.
        decay_steps = decay_epochs * steps if decay_epochs is not None else 0

        logger.info(
            'Using exponential learning rate with: '
            'initial_learning_rate: {initial_lr}, decay_steps: {decay_steps}, '
            'decay_rate: {decay_rate}'.format(initial_lr=initial_lr,
                                              decay_steps=decay_steps,
                                              decay_rate=decay_rate))
        lr = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=decay_steps,
            decay_rate=decay_rate)
    elif schedule_type == 'piecewise_constant':
        boundaries = schedule_params.get('boundaries', None)
        if boundaries is None:
            raise ValueError('boundaries parameter must be specified '
                             'for the piecewise_constant scheduler')

        values = schedule_params.get('values', None)
        if values is None:
            raise ValueError('values parameter must be specified '
                             'for the piecewise_constant')

        logger.info(
            'Using Piecewise constant decay with warmup. '
            'Parameters: batch_size: {batch_size}, epoch_size: {epoch_size}, '
            'boundaries: {boundaries}, values: {values}'.format(
                batch_size=batch_size, epoch_size=epoch_size,
                boundaries=boundaries, values=values))
        # Boundaries are given in epochs; convert to global steps.
        steps_per_epoch = epoch_size // batch_size
        boundaries = [steps_per_epoch * x for x in boundaries]
        lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries, values)
    elif schedule_type == 'step':
        lr = StepLearningRateWithLinearWarmup(steps, schedule_params)
    else:
        # Fix: previously an unknown type fell through all branches and the
        # trailing `return lr` raised an obscure UnboundLocalError.
        # Fail fast with a clear message (consistent with the other
        # build_scheduler variant in this codebase).
        raise KeyError(
            f'Unknown learning rate scheduler type: {schedule_type}')
    return lr
def resume_from_checkpoint(model, compression_ctrl, ckpt_path, steps_per_epoch):
    """Restore a compiled Keras model from a checkpoint and sync the
    compression scheduler.

    Returns the epoch to resume from, or 0 when no checkpoint was restored.
    """
    if load_checkpoint(model, ckpt_path) == 0:
        return 0

    resumed_step = model.optimizer.iterations.numpy()
    resumed_epoch = resumed_step // steps_per_epoch

    compression_ctrl.scheduler.load_state(resumed_step, steps_per_epoch)
    logger.info('Resuming from epoch %d', resumed_epoch)
    return resumed_epoch
def _load_tfrecords(self):
    """Build the dataset from pre-generated TFRecord files."""
    logger.info('Using TFRecords to load {} data.'.format(self._split))

    # Registered loaders are keyed by the dataset name with '/' stripped.
    dataset_key = self._dataset_name.replace('/', '')
    if dataset_key not in self._tfrecord_datasets:
        raise ValueError('Unknown dataset name: {}'.format(self._dataset_name))

    self._dataset_loader = self._tfrecord_datasets[dataset_key](
        config=self._config, is_train=self._is_train)
    return self._dataset_loader.as_dataset()
def export(config):
    """Export the compressed model built by the configured model builder."""
    model_builder = get_model_builder(config)
    keras_model = model_builder.build_model(weights=config.get('weights', None))

    compression_ctrl, compress_model = create_compressed_model(
        keras_model, config.nncf_config)

    if config.ckpt_path:
        ckpt = tf.train.Checkpoint(model=compress_model)
        load_checkpoint(ckpt, config.ckpt_path)

    save_path, save_format = get_saving_parameters(config)
    compression_ctrl.export_model(save_path, save_format)
    logger.info("Saved to {}".format(save_path))
def export(config):
    """Export the compressed model; its weights are tracked via get_variables."""
    model_builder = get_model_builder(config)
    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None),
                                is_training=False) as model:
        compression_ctrl, compress_model = create_compressed_model(
            model, config.nncf_config)

    if config.ckpt_path:
        tracked_variables = get_variables(compress_model)
        ckpt = tf.train.Checkpoint(variables=tracked_variables)
        load_checkpoint(ckpt, config.ckpt_path)

    save_path, save_format = get_saving_parameters(config)
    compression_ctrl.export_model(save_path, save_format)
    logger.info("Saved to {}".format(save_path))
def build_model(self, weights=None, is_training=None):
    """Build the RetinaNet Keras model, optionally initializing the backbone
    from a checkpoint and/or loading full pretrained weights."""
    with keras_utils.maybe_enter_backend_graph():
        net_outputs = self.model_outputs(self._input_layer, is_training)
        keras_model = tf.keras.models.Model(inputs=self._input_layer,
                                            outputs=net_outputs,
                                            name='retinanet')

    if self._checkpoint_path:
        logger.info('Init backbone')
        restore_fn = self.make_restore_checkpoint_fn()
        restore_fn(keras_model)

    if weights:
        logger.info('Loaded pretrained weights from {}'.format(weights))
        keras_model.load_weights(weights)

    return keras_model
def checkpoint_saver(config):
    """
    Load checkpoint and re-save it without optimizer (memory footprint is reduced)
    """
    model_builder = get_model_builder(config)
    base_model = model_builder.build_model()
    _, compress_model = create_compressed_model(base_model, config.nncf_config)

    # Track only the model — optimizer slots from the source checkpoint are
    # intentionally dropped.
    ckpt = tf.train.Checkpoint(model=compress_model)
    load_checkpoint(ckpt, config.ckpt_path)

    manager = tf.train.CheckpointManager(ckpt,
                                         config.checkpoint_save_dir,
                                         max_to_keep=None)
    save_path = manager.save()
    logger.info('Saved checkpoint: {}'.format(save_path))
def build_model(self, weights=None, is_training=None):
    """Build the Mask R-CNN Keras model, optionally initializing the backbone
    from a checkpoint and/or restoring baseline weights."""
    input_layers = self.build_input_layers(self._params, is_training)

    with keras_utils.maybe_enter_backend_graph():
        net_outputs = self.model_outputs(input_layers, is_training)
        keras_model = tf.keras.models.Model(inputs=input_layers,
                                            outputs=net_outputs,
                                            name='maskrcnn')

    if self._checkpoint_path:
        logger.info('Init backbone')
        restore_fn = self.make_restore_checkpoint_fn()
        restore_fn(keras_model)

    if weights:
        logger.info('Loaded pretrained weights from {}'.format(weights))
        _restore_baseline_weights(keras_model, weights)

    return keras_model
def run_train(config): strategy = get_distribution_strategy(config) # Create dataset builders = get_dataset_builders(config, strategy) datasets = [builder.build() for builder in builders] train_builder, _ = builders train_dataset, calibration_dataset = datasets train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset) # Training parameters epochs = config.epochs steps_per_epoch = train_builder.steps_per_epoch # We use `model_batch_size` to create input layer for model config.model_batch_size = train_builder.batch_size # Create model builder model_builder = get_model_builder(config) with TFOriginalModelManager(model_builder.build_model, weights=config.get('weights', None), is_training=True) as model: with strategy.scope(): compression_ctrl, compress_model = create_compressed_model(model, config.nncf_config) scheduler = build_scheduler( config=config, epoch_size=train_builder.num_examples, batch_size=train_builder.global_batch_size, steps=steps_per_epoch) optimizer = build_optimizer( config=config, scheduler=scheduler) loss_fn = model_builder.build_loss_fn() variables = get_variables(compress_model) checkpoint = tf.train.Checkpoint(variables=variables, optimizer=optimizer, step=tf.Variable(0)) checkpoint_manager = tf.train.CheckpointManager(checkpoint, config.checkpoint_save_dir, max_to_keep=None) initial_epoch = initial_step = 0 if config.ckpt_path: initial_epoch, initial_step = resume_from_checkpoint(checkpoint_manager, compression_ctrl, config.ckpt_path, steps_per_epoch) else: logger.info('Initialization...') compression_ctrl.initialize(dataset=calibration_dataset) train_step = create_train_step_fn(strategy, compress_model, loss_fn, optimizer) logger.info('Training...') train(train_step, train_dist_dataset, initial_epoch, initial_step, epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, config.log_dir, optimizer) logger.info('Compression statistics') print_statistics(compression_ctrl.statistics())
def _restore_checkpoint_fn(keras_model):
    """Loads pretrained model through scaffold function.

    Builds an assignment map from checkpoint variable names to the Keras
    model's variables, then initializes the model from the checkpoint.
    Closes over `checkpoint_path`, `prefix` and `skip_regex` from the
    enclosing scope.

    Raises:
        ValueError: if no variables to load could be matched.
    """
    if not checkpoint_path:
        logger.info('checkpoint_path is empty')
        return

    var_prefix = prefix
    # Ensure the prefix ends with '/' so stripping aligns on whole name parts.
    if prefix and not prefix.endswith('/'):
        var_prefix += '/'

    var_to_shape_map = _get_checkpoint_map(checkpoint_path)
    assert var_to_shape_map, 'var_to_shape_map should not be empty'

    vars_to_load = _build_assignment_map(keras_model,
                                         prefix=var_prefix,
                                         skip_variables_regex=skip_regex,
                                         var_to_shape_map=var_to_shape_map)
    if not vars_to_load:
        raise ValueError('Variables to load is empty.')
    tf.compat.v1.train.init_from_checkpoint(checkpoint_path, vars_to_load)
def _load_tfds(self):
    """Build the dataset for the configured split via TensorFlow Datasets."""
    logger.info('Using TFDS to load {} data.'.format(self._split))
    set_hard_limit_num_open_files()

    self._dataset_loader = tfds.builder(self._dataset_name,
                                        data_dir=self._dataset_dir)
    self._dataset_loader.download_and_prepare()

    # Optionally keep images as raw bytes so decoding can happen later
    # in the input pipeline.
    if self._skip_decoding:
        decoders = {'image': tfds.decode.SkipDecoding()}
    else:
        decoders = None

    read_config = tfds.ReadConfig(interleave_cycle_length=64,
                                  interleave_block_length=1)

    return self._dataset_loader.as_dataset(split=self._split,
                                           as_supervised=self._as_supervised,
                                           shuffle_files=True,
                                           decoders=decoders,
                                           read_config=read_config)
def run(config):
    """Classification pipeline: build the compressed model, then train,
    evaluate and/or export it according to config.mode."""
    strategy = get_distribution_strategy(config)

    model_fn, model_params = get_model(
        config.model,
        input_shape=config.get('input_info', {}).get('sample_size', None),
        num_classes=config.get('num_classes', 1000),
        pretrained=config.get('pretrained', False),
        weights=config.get('weights', None))

    builders = get_dataset_builders(config, strategy)
    datasets = [builder.build() for builder in builders]
    train_builder, validation_builder = builders
    train_dataset, validation_dataset = datasets

    train_epochs = config.epochs
    train_steps = train_builder.steps_per_epoch
    validation_steps = validation_builder.steps_per_epoch

    with TFOriginalModelManager(model_fn, **model_params) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)
            compression_callbacks = create_compression_callbacks(
                compression_ctrl, log_dir=config.log_dir)

            scheduler = build_scheduler(
                config=config,
                epoch_size=train_builder.num_examples,
                batch_size=train_builder.global_batch_size,
                steps=train_steps)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            metrics = [
                tf.keras.metrics.CategoricalAccuracy(name='acc@1'),
                tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='acc@5')
            ]
            loss_obj = tf.keras.losses.CategoricalCrossentropy(
                label_smoothing=0.1)

            compress_model.compile(optimizer=optimizer,
                                   loss=loss_obj,
                                   metrics=metrics,
                                   run_eagerly=config.get('eager_mode', False))
            compress_model.summary()

            initial_epoch = 0
            if config.ckpt_path is not None:
                initial_epoch = resume_from_checkpoint(
                    model=compress_model,
                    compression_ctrl=compression_ctrl,
                    ckpt_path=config.ckpt_path,
                    steps_per_epoch=train_steps)
            else:
                # Fresh run: calibrate the compression algorithm first.
                logger.info('initialization...')
                compression_ctrl.initialize(dataset=train_dataset)

    callbacks = get_callbacks(model_checkpoint=True,
                              include_tensorboard=True,
                              track_lr=True,
                              write_model_weights=False,
                              initial_step=initial_epoch * train_steps,
                              model_dir=config.log_dir,
                              ckpt_dir=config.checkpoint_save_dir)
    callbacks.extend(compression_callbacks)

    validation_kwargs = {
        'validation_data': validation_dataset,
        'validation_steps': validation_steps,
        'validation_freq': 1,
    }

    if 'train' in config.mode:
        logger.info('training...')
        compress_model.fit(train_dataset,
                           epochs=train_epochs,
                           steps_per_epoch=train_steps,
                           initial_epoch=initial_epoch,
                           callbacks=callbacks,
                           **validation_kwargs)

    # Evaluation runs unconditionally after (optional) training.
    logger.info('evaluation...')
    print_statistics(compression_ctrl.statistics())
    compress_model.evaluate(validation_dataset,
                            steps=validation_steps,
                            verbose=1)

    if 'export' in config.mode:
        save_path, save_format = get_saving_parameters(config)
        compression_ctrl.export_model(save_path, save_format)
        logger.info('Saved to {}'.format(save_path))
def run(config):
    """Detection pipeline: build the compressed model, then train, evaluate
    and/or export it according to config.mode."""
    strategy = get_distribution_strategy(config)
    if config.metrics_dump is not None:
        # Write a sentinel value up front so a crashed run leaves a record.
        write_metrics(0, config.metrics_dump)

    # Create dataset
    builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
    datasets = [builder.build() for builder in builders]
    train_builder, test_builder = builders
    train_dataset, test_dataset = datasets
    train_dist_dataset = strategy.experimental_distribute_dataset(
        train_dataset)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

    # Training parameters
    epochs = config.epochs
    steps_per_epoch = train_builder.steps_per_epoch
    num_test_batches = test_builder.steps_per_epoch

    # Create model builder
    model_builder = get_model_builder(config)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None)) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)

            scheduler = build_scheduler(config=config,
                                        steps_per_epoch=steps_per_epoch)
            optimizer = build_optimizer(config=config, scheduler=scheduler)

            eval_metric = model_builder.eval_metrics()
            loss_fn = model_builder.build_loss_fn(compress_model,
                                                  compression_ctrl.loss)
            predict_post_process_fn = model_builder.post_processing

            checkpoint = tf.train.Checkpoint(model=compress_model,
                                             optimizer=optimizer)
            checkpoint_manager = tf.train.CheckpointManager(
                checkpoint, config.checkpoint_save_dir, max_to_keep=None)

            initial_epoch = initial_step = 0
            if config.ckpt_path:
                initial_epoch, initial_step = resume_from_checkpoint(
                    checkpoint_manager, compression_ctrl, config.ckpt_path,
                    steps_per_epoch, config)
            else:
                # Fresh run: calibrate the compression algorithm first.
                logger.info('Initialization...')
                compression_ctrl.initialize(dataset=train_dataset)

    train_step = create_train_step_fn(strategy, compress_model, loss_fn,
                                      optimizer)
    test_step = create_test_step_fn(strategy, compress_model,
                                    predict_post_process_fn)

    if 'train' in config.mode:
        train(train_step, test_step, eval_metric, train_dist_dataset,
              test_dist_dataset, initial_epoch, initial_step, epochs,
              steps_per_epoch, checkpoint_manager, compression_ctrl,
              config.log_dir, optimizer, num_test_batches, config.print_freq)

    print_statistics(compression_ctrl.statistics())

    # Final validation pass regardless of mode.
    metric_result = evaluate(test_step, eval_metric, test_dist_dataset,
                             num_test_batches, config.print_freq)
    logger.info('Validation metric = {}'.format(metric_result))

    if config.metrics_dump is not None:
        write_metrics(metric_result['AP'], config.metrics_dump)

    if 'export' in config.mode:
        save_path, save_format = get_saving_parameters(config)
        compression_ctrl.export_model(save_path, save_format)
        logger.info("Saved to {}".format(save_path))
def build_optimizer(config, scheduler):
    """Create an optimizer from the 'optimizer' section of config.

    Supported types: 'sgd', 'momentum', 'rmsprop', 'adam', 'adamw'. The
    given `scheduler` is used as the learning rate. The result may be
    wrapped in tfa MovingAverage and/or Lookahead depending on params.

    Raises:
        ValueError: if the optimizer type is unknown.
    """
    optimizer_config = config.get('optimizer', {})
    optimizer_type = optimizer_config.get('type', 'adam').lower()
    optimizer_params = optimizer_config.get("optimizer_params", {})
    logger.info('Building %s optimizer with params %s', optimizer_type,
                optimizer_params)

    if optimizer_type == 'sgd':
        logger.info('Using SGD optimizer')
        nesterov = optimizer_params.get('nesterov', False)
        optimizer = tf.keras.optimizers.SGD(learning_rate=scheduler,
                                            nesterov=nesterov)
    elif optimizer_type == 'momentum':
        logger.info('Using momentum optimizer')
        nesterov = optimizer_params.get('nesterov', False)
        momentum = optimizer_params.get('momentum', 0.9)
        optimizer = tf.keras.optimizers.SGD(learning_rate=scheduler,
                                            momentum=momentum,
                                            nesterov=nesterov)
    elif optimizer_type == 'rmsprop':
        logger.info('Using RMSProp')
        rho = optimizer_params.get('rho', 0.9)
        momentum = optimizer_params.get('momentum', 0.9)
        epsilon = optimizer_params.get('epsilon', 1e-07)
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=scheduler,
                                                rho=rho,
                                                momentum=momentum,
                                                epsilon=epsilon)
    elif optimizer_type == 'adam':
        logger.info('Using Adam')
        beta_1 = optimizer_params.get('beta_1', 0.9)
        beta_2 = optimizer_params.get('beta_2', 0.999)
        epsilon = optimizer_params.get('epsilon', 1e-07)
        optimizer = tf.keras.optimizers.Adam(learning_rate=scheduler,
                                             beta_1=beta_1,
                                             beta_2=beta_2,
                                             epsilon=epsilon)
    elif optimizer_type == 'adamw':
        logger.info('Using AdamW')
        weight_decay = optimizer_params.get('weight_decay', 0.01)
        beta_1 = optimizer_params.get('beta_1', 0.9)
        beta_2 = optimizer_params.get('beta_2', 0.999)
        epsilon = optimizer_params.get('epsilon', 1e-07)
        optimizer = tfa.optimizers.AdamW(weight_decay=weight_decay,
                                         learning_rate=scheduler,
                                         beta_1=beta_1,
                                         beta_2=beta_2,
                                         epsilon=epsilon)
    else:
        raise ValueError('Unknown optimizer %s' % optimizer_type)

    # Optional wrappers applied on top of the base optimizer.
    moving_average_decay = optimizer_params.get('moving_average_decay', 0.)
    if moving_average_decay > 0.:
        logger.info('Including moving average decay.')
        optimizer = tfa.optimizers.MovingAverage(
            optimizer, average_decay=moving_average_decay, num_updates=None)
    if optimizer_params.get('lookahead', None):
        logger.info('Using lookahead optimizer.')
        optimizer = tfa.optimizers.Lookahead(optimizer)
    return optimizer
def build_scheduler(config, steps_per_epoch):
    """Create a learning-rate schedule from the 'optimizer' section of config.

    Supported types: 'exponential', 'piecewise_constant', 'multistep',
    'step', 'step_warmup', 'cosine'. Epoch-based parameters are converted
    to global steps using `steps_per_epoch`.

    Raises:
        ValueError: if a required schedule parameter is missing.
        KeyError: if the schedule type is unknown.
    """
    optimizer_config = config.get('optimizer', {})
    schedule_type = optimizer_config.get('schedule_type', 'step').lower()
    schedule_params = optimizer_config.get('schedule_params', {})
    # Parameters may live either in schedule_params or (legacy) directly in
    # the optimizer section.
    gamma = schedule_params.get('gamma', optimizer_config.get('gamma', 0.1))
    base_lr = schedule_params.get('base_lr',
                                  optimizer_config.get('base_lr', None))
    schedule_base_lr_check(schedule_type, base_lr)

    if schedule_type == 'exponential':
        step = schedule_params.get('step', optimizer_config.get('step', 1))
        decay_steps = step * steps_per_epoch
        logger.info(
            'Using exponential learning rate with: '
            'initial lr: %f, decay steps: %d, '
            'decay rate: %f', base_lr, decay_steps, gamma)
        lr = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=base_lr,
            decay_steps=decay_steps,
            decay_rate=gamma)
    elif schedule_type == 'piecewise_constant':
        boundaries = schedule_params.get(
            'boundaries', optimizer_config.get('boundaries', None))
        if boundaries is None:
            raise ValueError('`boundaries` parameter must be specified '
                             'for the `piecewise_constant` scheduler')

        values = schedule_params.get('values',
                                     optimizer_config.get('values', None))
        if values is None:
            raise ValueError('`values` parameter must be specified '
                             'for the `piecewise_constant` scheduler')

        logger.info(
            'Using Piecewise constant decay with warmup. '
            'Parameters: boundaries: %s, values: %s', boundaries, values)
        # Boundaries are given in epochs; convert to global steps.
        boundaries = [steps_per_epoch * x for x in boundaries]
        lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries, values)
    elif schedule_type == 'multistep':
        logger.info('Using MultiStep learning rate.')
        steps = schedule_params.get('steps',
                                    optimizer_config.get('steps', None))
        if steps is None:
            raise ValueError('`steps` parameter must be specified '
                             'for the `multistep` scheduler')
        # Milestones are given in epochs; convert to global steps.
        steps = [steps_per_epoch * x for x in steps]
        lr = MultiStepLearningRate(base_lr, steps, gamma=gamma)
    elif schedule_type == 'step':
        step = schedule_params.get('step', optimizer_config.get('step', 1))
        decay_steps = step * steps_per_epoch
        logger.info(
            'Using Step learning rate with: '
            'base_lr: %f, decay steps: %d, '
            'gamma: %f', base_lr, decay_steps, gamma)
        # staircase=True makes this a discrete step decay.
        lr = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=base_lr,
            decay_steps=decay_steps,
            decay_rate=gamma,
            staircase=True)
    elif schedule_type == 'step_warmup':
        lr = StepLearningRateWithLinearWarmup(schedule_params)
    elif schedule_type == 'cosine':
        decay_steps = steps_per_epoch * config.epochs
        logger.info(
            'Using Cosine learning rate with: '
            'base_lr: %f, decay steps: %d, ', base_lr, decay_steps)
        lr = tf.keras.experimental.CosineDecay(initial_learning_rate=base_lr,
                                               decay_steps=decay_steps)
    else:
        raise KeyError(
            f'Unknown learning rate scheduler type: {schedule_type}')
    return lr
def load_checkpoint(model, ckpt_path):
    """Restore model weights from ckpt_path (a checkpoint file or a directory
    containing checkpoints).

    Returns 0 when no usable checkpoint is found, otherwise None.
    """
    logger.info('Load from checkpoint is enabled.')

    if tf.io.gfile.isdir(ckpt_path):
        ckpt_file = tf.train.latest_checkpoint(ckpt_path)
        logger.info('Latest checkpoint: {}'.format(ckpt_file))
    else:
        # A direct path is accepted only when its .index file exists.
        ckpt_file = None
        if tf.io.gfile.exists(ckpt_path + '.index'):
            ckpt_file = ckpt_path
        logger.info('Provided checkpoint: {}'.format(ckpt_file))

    if not ckpt_file:
        logger.info('No checkpoint detected.')
        return 0

    logger.info(
        'Checkpoint file {} found and restoring from checkpoint'.format(
            ckpt_file))
    model.load_weights(ckpt_file).expect_partial()
    logger.info('Completed loading from checkpoint.')
    return None
def _build_assignment_map(keras_model, prefix='', skip_variables_regex=None,
                          var_to_shape_map=None):
    """Compute an assignment mapping for loading older checkpoints into a
    Keras model.

    Variable names are remapped from the original TPUEstimator model to the
    new Keras name.

    Args:
        keras_model: tf.keras.Model object to provide variables to assign.
        prefix: prefix in the variable name to be removed for alignment with
            names in the checkpoint.
        skip_variables_regex: regular expression to match the names of
            variables that do not need to be assigned.
        var_to_shape_map: variable name to shape mapping from the checkpoint.

    Returns:
        The variable assignment map.
    """
    assignment_map = {}

    checkpoint_names = None
    if var_to_shape_map:
        # Optimizer slots and the global step are never restored.
        predicate = lambda x: not x.endswith('Momentum') and not x.endswith(
            'global_step')
        checkpoint_names = list(filter(predicate, var_to_shape_map.keys()))

    for var in keras_model.variables:
        var_name = var.name

        if skip_variables_regex and re.match(skip_variables_regex, var_name):
            continue
        # Trim the index of the variable (e.g. 'kernel:0' -> 'kernel').
        if ':' in var_name:
            var_name = var_name[:var_name.rindex(':')]
        if var_name.startswith(prefix):
            var_name = var_name[len(prefix):]

        if not var_to_shape_map:
            assignment_map[var_name] = var
            continue

        # Match name with variables in the checkpoint.
        match_names = []
        for x in checkpoint_names:
            if x.endswith(var_name):
                match_names.append(x)

        try:
            if match_names:
                # Fixed assert message (was: 'more then on matches').
                assert len(match_names) == 1, \
                    'more than one match for {}: {}'.format(var_name,
                                                            match_names)
                checkpoint_names.remove(match_names[0])
                assignment_map[match_names[0]] = var
            else:
                # Fixed garbled message (was: 'Error not found var name').
                logger.info('Error: variable not found in checkpoint: %s',
                            var_name)
        except Exception as ex:
            logger.info('Error removing the match_name: %s', match_names)
            logger.info('Exception: %s', ex)
            raise

    logger.info('Found variable in checkpoint: %d', len(assignment_map))
    return assignment_map
def run_evaluation(config, eval_timeout=None):
    """Runs evaluation on checkpoint save directory"""
    strategy = get_distribution_strategy(config)
    if config.metrics_dump is not None:
        # Write a sentinel value up front so a crashed run leaves a record.
        write_metrics(0, config.metrics_dump)

    dataset_builder = get_dataset_builders(config,
                                           strategy.num_replicas_in_sync)
    dataset = dataset_builder.build()
    num_batches = dataset_builder.steps_per_epoch
    test_dist_dataset = strategy.experimental_distribute_dataset(dataset)

    # We use `model_batch_size` to create input layer for model
    config.model_batch_size = dataset_builder.batch_size

    model_builder = get_model_builder(config)

    with TFOriginalModelManager(model_builder.build_model,
                                weights=config.get('weights', None),
                                is_training=False) as model:
        with strategy.scope():
            compression_ctrl, compress_model = create_compressed_model(
                model, config.nncf_config)
            variables = get_variables(compress_model)
            checkpoint = tf.train.Checkpoint(variables=variables,
                                             step=tf.Variable(0))
            eval_metric = model_builder.eval_metrics()
            predict_post_process_fn = model_builder.post_processing

            test_step = create_test_step_fn(strategy, compress_model,
                                            predict_post_process_fn)

            if 'test' in config.mode:
                # One-shot evaluation of a single checkpoint.
                if config.ckpt_path:
                    load_checkpoint(checkpoint, config.ckpt_path)

                statistics = compression_ctrl.statistics()
                print_statistics(statistics)

                metric_result = evaluate(test_step, eval_metric,
                                         test_dist_dataset, num_batches,
                                         config.print_freq)
                eval_metric.reset_states()
                logger.info('Test metric = {}'.format(metric_result))

                if 'export' in config.mode:
                    save_path, save_format = get_saving_parameters(config)
                    compression_ctrl.export_model(save_path, save_format)
                    logger.info("Saved to {}".format(save_path))

            elif 'train' in config.mode:
                # Sidecar-evaluation mode: poll the training directory and
                # evaluate every new checkpoint that appears.
                validation_summary_writer = SummaryWriter(config.log_dir,
                                                          'validation')
                checkpoint_dir = config.checkpoint_save_dir
                eval_timeout = config.eval_timeout

                for checkpoint_path in tf.train.checkpoints_iterator(
                        checkpoint_dir, timeout=eval_timeout):
                    status = checkpoint.restore(checkpoint_path)
                    status.expect_partial()
                    logger.info(
                        'Checkpoint file {} found and restoring from checkpoint'.
                        format(checkpoint_path))
                    logger.info('Checkpoint step: {}'.format(
                        checkpoint.step.numpy()))
                    metric_result = evaluate(test_step, eval_metric,
                                             test_dist_dataset, num_batches,
                                             config.print_freq)

                    current_step = checkpoint.step.numpy()
                    validation_summary_writer(metrics=metric_result,
                                              step=current_step)

                    eval_metric.reset_states()
                    logger.info('Validation metric = {}'.format(metric_result))

                validation_summary_writer.close()

            if config.metrics_dump is not None:
                write_metrics(metric_result['AP'], config.metrics_dump)
def build_optimizer(config, scheduler):
    """Create the training optimizer described by the 'optimizer' section of
    config, using `scheduler` as the learning rate.

    Supported types: 'sgd', 'momentum', 'rmsprop', 'adam', 'adamw'. The
    result may be wrapped in tfa MovingAverage and/or Lookahead.

    Raises:
        ValueError: if the optimizer type is unknown.
    """
    optimizer_config = config.get('optimizer', {})
    opt_type = optimizer_config.get('type', 'adam').lower()
    opt_params = optimizer_config.get('optimizer_params', {})
    logger.info('Building %s optimizer with params %s', opt_type, opt_params)

    if opt_type in ['sgd', 'momentum']:
        display_names = {'sgd': 'SGD', 'momentum': 'momentum'}
        logger.info('Using %s optimizer', display_names[opt_type])
        # Plain SGD defaults to zero momentum; 'momentum' type defaults to 0.9.
        momentum_default = 0.9 if opt_type == 'momentum' else 0.0
        sgd_kwargs = {
            'learning_rate': scheduler,
            'nesterov': opt_params.get('nesterov', False),
            'momentum': opt_params.get('momentum', momentum_default),
        }
        weight_decay = optimizer_config.get('weight_decay', None)
        if weight_decay:
            # Decoupled weight decay requires the tfa variant.
            optimizer = tfa.optimizers.SGDW(**sgd_kwargs,
                                            weight_decay=weight_decay)
        else:
            optimizer = tf.keras.optimizers.SGD(**sgd_kwargs)
    elif opt_type == 'rmsprop':
        logger.info('Using RMSProp optimizer')
        optimizer = tf.keras.optimizers.RMSprop(
            learning_rate=scheduler,
            rho=opt_params.get('rho', 0.9),
            momentum=opt_params.get('momentum', 0.9),
            epsilon=opt_params.get('epsilon', 1e-07))
    elif opt_type in ['adam', 'adamw']:
        display_names = {'adam': 'Adam', 'adamw': 'AdamW'}
        logger.info('Using %s optimizer', display_names[opt_type])
        beta_1, beta_2 = opt_params.get('betas', [0.9, 0.999])
        adam_kwargs = {
            'learning_rate': scheduler,
            'beta_1': beta_1,
            'beta_2': beta_2,
            'epsilon': opt_params.get('eps', 1e-07),
            'amsgrad': opt_params.get('amsgrad', False),
        }
        # Only AdamW gets a non-zero decoupled weight decay by default.
        default_weight_decay = 0.01 if opt_type == 'adamw' else None
        weight_decay = optimizer_config.get('weight_decay',
                                            default_weight_decay)
        if weight_decay:
            optimizer = tfa.optimizers.AdamW(**adam_kwargs,
                                             weight_decay=weight_decay)
        else:
            optimizer = tf.keras.optimizers.Adam(**adam_kwargs)
    else:
        raise ValueError('Unknown optimizer %s' % opt_type)

    # Optional wrappers applied on top of the base optimizer.
    moving_average_decay = opt_params.get('moving_average_decay', 0.)
    if moving_average_decay > 0.:
        logger.info('Including moving average decay.')
        optimizer = tfa.optimizers.MovingAverage(
            optimizer, average_decay=moving_average_decay, num_updates=None)
    if opt_params.get('lookahead', None):
        logger.info('Using lookahead optimizer.')
        optimizer = tfa.optimizers.Lookahead(optimizer)
    return optimizer
def load_checkpoint(checkpoint, ckpt_path):
    """Restore a tf.train.Checkpoint from ckpt_path (a checkpoint file or a
    directory containing checkpoints).

    Returns 0 when no usable checkpoint is found, otherwise None.
    """
    logger.info('Load from checkpoint is enabled')

    if tf.io.gfile.isdir(ckpt_path):
        path_to_checkpoint = tf.train.latest_checkpoint(ckpt_path)
        logger.info('Latest checkpoint: {}'.format(path_to_checkpoint))
    else:
        # A direct path is accepted only when its .index file exists.
        path_to_checkpoint = None
        if tf.io.gfile.exists(ckpt_path + '.index'):
            path_to_checkpoint = ckpt_path
        logger.info('Provided checkpoint: {}'.format(path_to_checkpoint))

    if not path_to_checkpoint:
        logger.info('No checkpoint detected')
        return 0

    logger.info(
        'Checkpoint file {} found and restoring from checkpoint'.format(
            path_to_checkpoint))
    checkpoint.restore(path_to_checkpoint).expect_partial()
    logger.info('Completed loading from checkpoint')
    return None
def train(train_step, test_step, eval_metric, train_dist_dataset,
          test_dist_dataset, initial_epoch, initial_step, epochs,
          steps_per_epoch, checkpoint_manager, compression_ctrl, log_dir,
          optimizer):
    """Custom training loop that also evaluates on the validation set after
    every epoch.

    Resumes at (initial_epoch, initial_step), saves one checkpoint per epoch,
    and writes train/validation metrics and scalar compression statistics to
    TensorBoard via `SummaryWriter`.

    Raises:
        ValueError: if the reported 'total_loss' becomes NaN.
    """
    train_summary_writer = SummaryWriter(log_dir, 'train')
    validation_summary_writer = SummaryWriter(log_dir, 'validation')
    compression_summary_writer = SummaryWriter(log_dir, 'compression')

    logger.info('Training started')
    for epoch in range(initial_epoch, epochs):
        logger.info('Epoch {}/{}'.format(epoch, epochs))
        compression_ctrl.scheduler.epoch_step(epoch)

        for step, x in enumerate(train_dist_dataset):
            # Skip batches already consumed before the resume point
            # (only relevant for the first resumed epoch).
            if epoch == initial_epoch and step < initial_step % steps_per_epoch:
                continue

            # step == steps_per_epoch marks the epoch boundary: save and
            # move on to the next epoch.
            if step == steps_per_epoch:
                save_path = checkpoint_manager.save()
                logger.info('Saved checkpoint for epoch={}: {}'.format(
                    epoch, save_path))
                break

            compression_ctrl.scheduler.step()
            train_loss = train_step(x)
            # Convert per-loss tensors to plain floats for logging/NaN check.
            train_metric_result = tf.nest.map_structure(
                lambda s: s.numpy().astype(float), train_loss)

            if np.isnan(train_metric_result['total_loss']):
                raise ValueError('total loss is NaN')

            train_metric_result.update(
                {'learning_rate': optimizer.lr(optimizer.iterations).numpy()})
            train_summary_writer(metrics=train_metric_result,
                                 step=optimizer.iterations.numpy())

            if step % 100 == 0:
                logger.info('Step {}/{}'.format(step, steps_per_epoch))
                logger.info('Training metric = {}'.format(train_metric_result))

        # End-of-epoch validation pass.
        logger.info('Evaluation...')
        test_metric_result = evaluate(test_step, eval_metric,
                                      test_dist_dataset)
        validation_summary_writer(metrics=test_metric_result,
                                  step=optimizer.iterations.numpy())
        eval_metric.reset_states()
        logger.info('Validation metric = {}'.format(test_metric_result))

        # Print full compression statistics; only scalar values go to
        # TensorBoard.
        statistics = compression_ctrl.statistics()
        print_statistics(statistics)
        statistics = {
            'compression/statistics/' + key: value
            for key, value in statistics.items()
            if isinstance(value, (int, float))
        }
        compression_summary_writer(metrics=statistics,
                                   step=optimizer.iterations.numpy())

    train_summary_writer.close()
    validation_summary_writer.close()
    compression_summary_writer.close()