def _instantiate_optimizers(strategy, learning_rate, beta_1, train_settings):
    LOGGER.info(" -------- Creating Optimizers --------")
    with strategy.scope():
        srfr_optimizer = NovoGrad(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=train_settings["beta_2"],
            weight_decay=train_settings["weight_decay"],
            name="novograd_srfr",
        )
        srfr_optimizer = mixed_precision.LossScaleOptimizer(
            srfr_optimizer,
            loss_scale="dynamic",
        )
        discriminator_optimizer = NovoGrad(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=train_settings["beta_2"],
            weight_decay=train_settings["weight_decay"],
            name="novograd_discriminator",
        )
        discriminator_optimizer = mixed_precision.LossScaleOptimizer(
            discriminator_optimizer,
            loss_scale="dynamic",
        )
    return (
        srfr_optimizer,
        discriminator_optimizer,
    )
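# A hypothetical call site, assuming a MirroredStrategy and a settings dict
# carrying the `beta_2` and `weight_decay` keys read above (the values shown
# are illustrative, not taken from the project's config files):
#
#   strategy = tf.distribute.MirroredStrategy()
#   train_settings = {"beta_2": 0.999, "weight_decay": 1e-5}
#   srfr_opt, discriminator_opt = _instantiate_optimizers(
#       strategy, learning_rate=1e-3, beta_1=0.9,
#       train_settings=train_settings)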
def test_sparse_sample(self):
    self.run_sparse_sample(
        iterations=1,
        expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]],
        optimizer=NovoGrad(lr=0.1, epsilon=1e-8),
    )
def test_sparse_sample_with_weight_decay(self):
    self.run_sparse_sample(
        iterations=1,
        expected=[[0.945278642, 1.8905572849], [2.9100000012, 3.8800000016]],
        optimizer=NovoGrad(lr=0.1, weight_decay=0.1, epsilon=1e-8),
    )
def test_sparse_sample(dtype):
    run_sparse_sample(
        iterations=2,
        expected=[[0.71, 2.0], [3.0, 3.71]],
        optimizer=NovoGrad(lr=0.1, epsilon=1e-8),
        dtype=dtype,
    )
def test_sparse_sample_with_grad_averaging(self):
    self.run_sparse_sample(
        iterations=2,
        expected=[[0.9105572849, 1.8211145698], [2.8800000024, 3.8400000032]],
        optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8),
    )
def test_dense_sample_with_grad_averaging(dtype):
    run_dense_sample(
        iterations=2,
        expected=[[0.9105572849, 1.8211145698], [2.8800000024, 3.8400000032]],
        optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8),
        dtype=dtype,
    )
def test_dense_sample_with_weight_decay(dtype):
    run_dense_sample(
        iterations=1,
        expected=[[0.945278642, 1.8905572849], [2.9100000012, 3.8800000016]],
        optimizer=NovoGrad(lr=0.1, weight_decay=0.1, epsilon=1e-8),
        dtype=dtype,
    )
def test_dense_sample(dtype):
    run_dense_sample(
        iterations=1,
        expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]],
        optimizer=NovoGrad(lr=0.1, epsilon=1e-8),
        dtype=dtype,
    )
def test_sparse_sample_with_grad_averaging(dtype):
    run_sparse_sample(
        iterations=2,
        expected=[[0.8, 2.0], [3.0, 3.8]],
        optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8),
        dtype=dtype,
    )
def test_sparse_sample_with_weight_decay(dtype):
    run_sparse_sample(
        iterations=2,
        expected=[[0.6821, 2.0], [3.0, 3.5954]],
        optimizer=NovoGrad(lr=0.1, weight_decay=0.1, epsilon=1e-8),
        dtype=dtype,
    )
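# The `run_dense_sample`/`run_sparse_sample` helpers are not shown in this
# file. Below is a minimal sketch of `run_dense_sample`, assuming the usual
# tensorflow_addons test-harness convention (two 2-element variables updated
# with fixed gradients, then compared against `expected`); the real helper
# may differ in tolerances and tf.function wrapping.
import numpy as np
import tensorflow as tf


def run_dense_sample(iterations, expected, optimizer, dtype=tf.float32):
    var_0 = tf.Variable([1.0, 2.0], dtype=dtype)
    var_1 = tf.Variable([3.0, 4.0], dtype=dtype)
    grad_0 = tf.constant([0.1, 0.2], dtype=dtype)
    grad_1 = tf.constant([0.3, 0.4], dtype=dtype)
    grads_and_vars = list(zip([grad_0, grad_1], [var_0, var_1]))
    for _ in range(iterations):
        optimizer.apply_gradients(grads_and_vars)
    # With lr=0.1 and epsilon=1e-8, one NovoGrad step moves var_0 to
    # ~[0.9553, 1.9106], matching test_dense_sample above.
    np.testing.assert_allclose(var_0.numpy(), expected[0], atol=2e-4)
    np.testing.assert_allclose(var_1.numpy(), expected[1], atol=2e-4)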
def test_fit_simple_linear_model():
    np.random.seed(0x2020)
    tf.random.set_seed(0x2020)

    x = np.random.standard_normal((100000, 3))
    w = np.random.standard_normal((3, 1))
    y = np.dot(x, w) + np.random.standard_normal((100000, 1)) * 1e-5

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(input_shape=(3,), units=1))
    model.compile(NovoGrad(), loss="mse")
    model.fit(x, y, epochs=2)

    x = np.random.standard_normal((100, 3))
    y = np.dot(x, w)
    predicted = model.predict(x)

    max_abs_diff = np.max(np.abs(predicted - y))
    assert max_abs_diff < 1e-2
def test_serialization():
    optimizer = NovoGrad(lr=1e-4, weight_decay=0.0, grad_averaging=False)
    config = tf.keras.optimizers.serialize(optimizer)
    new_optimizer = tf.keras.optimizers.deserialize(config)
    assert new_optimizer.get_config() == optimizer.get_config()
def test_get_config():
    opt = NovoGrad(lr=1e-4, weight_decay=0.0, grad_averaging=False)
    config = opt.get_config()
    assert config["learning_rate"] == 1e-4
    assert config["weight_decay"] == 0.0
    assert config["grad_averaging"] is False
def main():
    """Main training function."""
    timing = TimingLogger()
    timing.start()
    network_settings, train_settings, preprocess_settings = parseConfigsFile(
        ['network', 'train', 'preprocess'])

    strategy = tf.distribute.MirroredStrategy()
    BATCH_SIZE = train_settings['batch_size'] * strategy.num_replicas_in_sync
    temp_folder = Path.cwd().joinpath('temp', 'synthetic_ds')

    LOGGER.info(' -------- Importing Datasets --------')
    vgg_dataset = VggFace2(mode='concatenated')
    synthetic_dataset = vgg_dataset.get_dataset()
    synthetic_dataset = vgg_dataset.augment_dataset()
    synthetic_dataset = vgg_dataset.normalize_dataset()
    synthetic_dataset = synthetic_dataset.cache(str(temp_folder))
    # synthetic_dataset_len = vgg_dataset.get_dataset_size()
    synthetic_dataset_len = 100_000
    synthetic_num_classes = vgg_dataset.get_number_of_classes()
    synthetic_dataset = synthetic_dataset.shuffle(
        buffer_size=2_048).repeat().batch(BATCH_SIZE).prefetch(1)

    lfw_path = Path.cwd().joinpath('temp', 'lfw')
    lfw_dataset = LFW()
    (left_pairs, left_aug_pairs, right_pairs, right_aug_pairs,
     is_same_list) = lfw_dataset.get_dataset()
    left_pairs = left_pairs.batch(BATCH_SIZE).cache(
        str(lfw_path.joinpath('left'))).prefetch(AUTOTUNE)
    left_aug_pairs = left_aug_pairs.batch(BATCH_SIZE).cache(
        str(lfw_path.joinpath('left_aug'))).prefetch(AUTOTUNE)
    right_pairs = right_pairs.batch(BATCH_SIZE).cache(
        str(lfw_path.joinpath('right'))).prefetch(AUTOTUNE)
    right_aug_pairs = right_aug_pairs.batch(BATCH_SIZE).cache(
        str(lfw_path.joinpath('right_aug'))).prefetch(AUTOTUNE)

    # Using `distribute_dataset` to distribute the batches across the GPUs
    synthetic_dataset = strategy.experimental_distribute_dataset(
        synthetic_dataset)
    left_pairs = strategy.experimental_distribute_dataset(left_pairs)
    left_aug_pairs = strategy.experimental_distribute_dataset(left_aug_pairs)
    right_pairs = strategy.experimental_distribute_dataset(right_pairs)
    right_aug_pairs = strategy.experimental_distribute_dataset(right_aug_pairs)

    LOGGER.info(' -------- Creating Models and Optimizers --------')
    EPOCHS = generate_num_epochs(
        train_settings['iterations'],
        synthetic_dataset_len,
        BATCH_SIZE,
    )

    with strategy.scope():
        srfr_model = SRFR(
            num_filters=network_settings['num_filters'],
            depth=50,
            categories=network_settings['embedding_size'],
            num_gc=network_settings['gc'],
            num_blocks=network_settings['num_blocks'],
            residual_scailing=network_settings['residual_scailing'],
            training=True,
            input_shape=preprocess_settings['image_shape_low_resolution'],
            num_classes_syn=synthetic_num_classes,
        )
        sr_discriminator_model = DiscriminatorNetwork()

        srfr_optimizer = NovoGrad(
            learning_rate=train_settings['learning_rate'],
            beta_1=train_settings['momentum'],
            beta_2=train_settings['beta_2'],
            weight_decay=train_settings['weight_decay'],
            name='novograd_srfr',
        )
        srfr_optimizer = mixed_precision.LossScaleOptimizer(
            srfr_optimizer,
            loss_scale='dynamic',
        )
        discriminator_optimizer = NovoGrad(
            learning_rate=train_settings['learning_rate'],
            beta_1=train_settings['momentum'],
            beta_2=train_settings['beta_2'],
            weight_decay=train_settings['weight_decay'],
            name='novograd_discriminator',
        )
        discriminator_optimizer = mixed_precision.LossScaleOptimizer(
            discriminator_optimizer, loss_scale='dynamic')

        train_loss = partial(
            strategy.reduce,
            reduce_op=tf.distribute.ReduceOp.MEAN,
            axis=0,
        )

    checkpoint = tf.train.Checkpoint(
        epoch=tf.Variable(1),
        step=tf.Variable(1),
        srfr_model=srfr_model,
        sr_discriminator_model=sr_discriminator_model,
        srfr_optimizer=srfr_optimizer,
        discriminator_optimizer=discriminator_optimizer,
    )
    manager = tf.train.CheckpointManager(
        checkpoint,
        directory='./training_checkpoints',
        max_to_keep=5,
    )

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_summary_writer = tf.summary.create_file_writer(
        str(Path.cwd().joinpath('logs', 'gradient_tape', current_time,
                                'train')),
    )
    test_summary_writer = tf.summary.create_file_writer(
        str(Path.cwd().joinpath('logs', 'gradient_tape', current_time,
                                'test')),
    )

    LOGGER.info(' -------- Starting Training --------')
    with strategy.scope():
        checkpoint.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            LOGGER.info(f' Restored from {manager.latest_checkpoint}')
        else:
            LOGGER.info(' Initializing from scratch.')

        for epoch in range(int(checkpoint.epoch), EPOCHS + 1):
            timing.start(Train.__name__)
            LOGGER.info(f' Start of epoch {epoch}')

            train = Train(strategy, srfr_model, srfr_optimizer,
                          sr_discriminator_model, discriminator_optimizer,
                          train_summary_writer, test_summary_writer,
                          checkpoint, manager)
            srfr_loss, discriminator_loss = train.train_srfr_model(
                BATCH_SIZE,
                train_loss,
                synthetic_dataset,
                synthetic_num_classes,
                left_pairs,
                left_aug_pairs,
                right_pairs,
                right_aug_pairs,
                is_same_list,
                sr_weight=train_settings['super_resolution_weight'],
                scale=train_settings['scale'],
                margin=train_settings['angular_margin'],
                # natural_ds,
                # num_classes_natural,
            )
            elapsed_time = timing.end(Train.__name__, True)

            with train_summary_writer.as_default():
                tf.summary.scalar('srfr_loss_per_epoch', srfr_loss,
                                  step=epoch)
                tf.summary.scalar(
                    'discriminator_loss_per_epoch',
                    discriminator_loss,
                    step=epoch,
                )
                tf.summary.scalar('training_time_per_epoch', elapsed_time,
                                  step=epoch)
            LOGGER.info((f' Epoch {epoch}, SRFR Loss: {srfr_loss:.3f},'
                         f' Discriminator Loss: {discriminator_loss:.3f}'))

            train.save_model()
            checkpoint.epoch.assign_add(1)
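# `generate_num_epochs` is defined elsewhere in the project. A plausible
# sketch, assuming it converts the configured iteration budget into whole
# epochs over the dataset (the real implementation may round differently):
import math


def generate_num_epochs(iterations, dataset_len, batch_size):
    # One epoch covers ceil(dataset_len / batch_size) optimizer steps, so the
    # requested iteration count maps to this many whole epochs.
    steps_per_epoch = math.ceil(dataset_len / batch_size)
    return math.ceil(iterations / steps_per_epoch)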
def test_get_config(self):
    opt = NovoGrad(lr=1e-4, weight_decay=0.0, grad_averaging=False)
    config = opt.get_config()
    self.assertEqual(config["learning_rate"], 1e-4)
    self.assertEqual(config["weight_decay"], 0.0)
    self.assertEqual(config["grad_averaging"], False)
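# The config produced by `get_config` also rebuilds an equivalent optimizer
# through the standard Keras `from_config` API; a short usage sketch (not
# part of the test suite):
restored = NovoGrad.from_config(NovoGrad(lr=1e-4).get_config())
assert restored.get_config()["learning_rate"] == 1e-4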