def main(unused_argv):
  assert FLAGS.data is not None, 'Provide training data path via --data.'
  batch_size = FLAGS.num_cores * PER_CORE_BATCH_SIZE
  training_steps_per_epoch = int(APPROX_IMAGENET_TRAINING_IMAGES / batch_size)
  validation_steps = int(IMAGENET_VALIDATION_IMAGES // batch_size)

  model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
  logging.info('Saving tensorboard summaries at %s', model_dir)

  logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local')
  resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  tf.contrib.distribute.initialize_tpu_system(resolver)
  strategy = tf.contrib.distribute.TPUStrategy(resolver)

  logging.info('Use bfloat16: %s.', USE_BFLOAT16)
  logging.info('Use global batch size: %s.', batch_size)
  logging.info('Enable top 5 accuracy: %s.', FLAGS.eval_top_5_accuracy)
  logging.info('Training model using data in directory "%s".', FLAGS.data)

  with strategy.scope():
    logging.info('Building Keras ResNet-50 model')
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)

    logging.info('Compiling model.')
    metrics = ['sparse_categorical_accuracy']

    if FLAGS.eval_top_5_accuracy:
      metrics.append(sparse_top_k_categorical_accuracy)

    model.compile(
        optimizer=gradient_descent.SGD(
            learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
        loss='sparse_categorical_crossentropy',
        metrics=metrics)

  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True,
      data_dir=FLAGS.data,
      batch_size=batch_size,
      use_bfloat16=USE_BFLOAT16)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False,
      data_dir=FLAGS.data,
      batch_size=batch_size,
      use_bfloat16=USE_BFLOAT16)

  lr_schedule_cb = LearningRateBatchScheduler(
      schedule=learning_rate_schedule_wrapper(training_steps_per_epoch))
  tensorboard_cb = eval_utils.TensorBoardWithValidation(
      log_dir=model_dir,
      validation_imagenet_input=imagenet_eval,
      validation_steps=validation_steps,
      validation_epochs=[30, 60, 90])

  training_callbacks = [lr_schedule_cb, tensorboard_cb]

  model.fit(
      imagenet_train.input_fn(),
      epochs=EPOCHS,
      steps_per_epoch=training_steps_per_epoch,
      callbacks=training_callbacks)

  model_saving_utils.save_model(model, model_dir, WEIGHTS_TXT)
def main(unused_argv):
  model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
  batch_size = PER_CORE_BATCH_SIZE * FLAGS.num_cores
  steps_per_epoch = FLAGS.steps_per_epoch or (
      int(APPROX_IMAGENET_TRAINING_IMAGES // batch_size))
  steps_per_eval = IMAGENET_VALIDATION_IMAGES // batch_size

  logging.info('Saving checkpoints at %s', model_dir)

  logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local')
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  strategy = tf.distribute.experimental.TPUStrategy(resolver)

  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True,
      data_dir=FLAGS.data,
      batch_size=batch_size,
      use_bfloat16=_USE_BFLOAT16)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False,
      data_dir=FLAGS.data,
      batch_size=batch_size,
      use_bfloat16=_USE_BFLOAT16)

  train_iterator = strategy.experimental_distribute_dataset(
      imagenet_train.input_fn()).make_initializable_iterator()
  test_iterator = strategy.experimental_distribute_dataset(
      imagenet_eval.input_fn()).make_initializable_iterator()

  with strategy.scope():
    logging.info('Building Keras ResNet-50 model')
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=_BASE_LEARNING_RATE, momentum=0.9, nesterov=True)
    training_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'training_accuracy', dtype=tf.float32)
    test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'test_accuracy', dtype=tf.float32)
    logging.info('Finished building Keras ResNet-50 model')

    def train_step(inputs):
      """Training StepFn."""
      images, labels = inputs
      with tf.GradientTape() as tape:
        predictions = model(images, training=True)

        # Loss calculations.
        #
        # Part 1: Prediction loss.
        prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
            labels, predictions)
        loss1 = tf.reduce_mean(prediction_loss)
        # Part 2: Model weights regularization.
        loss2 = tf.reduce_sum(model.losses)

        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        loss = loss1 + loss2
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      update_vars = optimizer.apply_gradients(
          zip(grads, model.trainable_variables))
      update_loss = training_loss.update_state(loss)
      update_accuracy = training_accuracy.update_state(labels, predictions)
      with tf.control_dependencies(
          [update_vars, update_loss, update_accuracy]):
        return tf.identity(loss)

    def test_step(inputs):
      """Evaluation StepFn."""
      images, labels = inputs
      predictions = model(images, training=False)
      loss = tf.keras.losses.sparse_categorical_crossentropy(
          labels, predictions)
      loss = tf.reduce_mean(loss)
      update_loss = test_loss.update_state(loss)
      update_accuracy = test_accuracy.update_state(labels, predictions)
      with tf.control_dependencies([update_loss, update_accuracy]):
        return tf.identity(loss)

    dist_train = strategy.experimental_local_results(
        strategy.run(train_step, args=(next(train_iterator),)))
    dist_test = strategy.experimental_local_results(
        strategy.run(test_step, args=(next(test_iterator),)))

    training_loss_result = training_loss.result()
    training_accuracy_result = training_accuracy.result()
    test_loss_result = test_loss.result()
    test_accuracy_result = test_accuracy.result()

    train_iterator_init = train_iterator.initialize()
    test_iterator_init = test_iterator.initialize()

  config = tf.ConfigProto()
  config.allow_soft_placement = True
  cluster_spec = resolver.cluster_spec()
  if cluster_spec:
    config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

  with tf.Session(target=resolver.master(), config=config) as sess:
    all_variables = (
        tf.global_variables() + training_loss.variables +
        training_accuracy.variables + test_loss.variables +
        test_accuracy.variables)
    sess.run([v.initializer for v in all_variables])
    sess.run(train_iterator_init)

    for epoch in range(0, FLAGS.num_epochs):
      logging.info('Starting to run epoch: %s', epoch)
      for step in range(steps_per_epoch):
        learning_rate = compute_learning_rate(
            epoch + 1 + (float(step) / steps_per_epoch))
        sess.run(optimizer.lr.assign(learning_rate))
        if step % 20 == 0:
          logging.info('Learning rate at step %s in epoch %s is %s', step,
                       epoch, learning_rate)
        sess.run(dist_train)
        if step % 20 == 0:
          logging.info('Training loss: %s, accuracy: %s%%',
                       round(sess.run(training_loss_result), 4),
                       round(sess.run(training_accuracy_result) * 100, 2))
      training_loss.reset_states()
      training_accuracy.reset_states()

      sess.run(test_iterator_init)
      for step in range(steps_per_eval):
        if step % 20 == 0:
          logging.info('Starting to run eval step %s of epoch: %s', step,
                       epoch)
        sess.run(dist_test)
        if step % 20 == 0:
          logging.info('Test loss: %s, accuracy: %s%%',
                       round(sess.run(test_loss_result), 4),
                       round(sess.run(test_accuracy_result) * 100, 2))
      test_loss.reset_states()
      test_accuracy.reset_states()
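# The TF1 training loop above assigns `optimizer.lr` from `compute_learning_rate`,
# which is defined elsewhere in the original file. The sketch below is only an
# illustration of such a schedule (linear warmup followed by staircase decay);
# the `_LR_SCHEDULE` constant and its breakpoints are assumptions, not taken
# from this excerpt.
_LR_SCHEDULE = [
    # (multiplier, epoch to start) tuples.
    (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]


def compute_learning_rate(lr_epoch):
  """Learning rate for a (possibly fractional) epoch index."""
  warmup_lr_multiplier, warmup_end_epoch = _LR_SCHEDULE[0]
  if lr_epoch < warmup_end_epoch:
    # Linear warmup: ramp up to the base rate over the first warmup epochs.
    return (_BASE_LEARNING_RATE * warmup_lr_multiplier * lr_epoch /
            warmup_end_epoch)
  learning_rate = _BASE_LEARNING_RATE
  for mult, start_epoch in _LR_SCHEDULE:
    if lr_epoch >= start_epoch:
      learning_rate = _BASE_LEARNING_RATE * mult
    else:
      break
  return learning_rate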
def main(argv):
  logging.info('Building Keras ResNet-50 model')
  model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
  if FLAGS.use_tpu:
    logging.info('Converting from CPU to TPU model.')
    resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
    model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)

  logging.info('Compiling model.')
  model.compile(
      optimizer=tf.keras.optimizers.SGD(
          lr=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])

  if FLAGS.data is None:
    training_images = np.random.randn(
        BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3).astype(np.float32)
    training_labels = np.random.randint(
        NUM_CLASSES, size=BATCH_SIZE, dtype=np.int32)
    logging.info('Training model using synthetic data.')
    model.fit(
        training_images,
        training_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE)
    logging.info('Evaluating the model on synthetic data.')
    model.evaluate(training_images, training_labels, verbose=0)
  else:
    model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
    imagenet_train = imagenet_input.ImageNetInput(
        is_training=True,
        data_dir=FLAGS.data,
        per_core_batch_size=PER_CORE_BATCH_SIZE)
    logging.info('Training model using real data in directory "%s".',
                 FLAGS.data)
    # If evaluating top 5 accuracy, we feed the inputs from a Python generator,
    # so we need to build a single batch for all of the cores, which will be
    # split on TPU.
    per_core_batch_size = (
        BATCH_SIZE if FLAGS.eval_top_5_accuracy else PER_CORE_BATCH_SIZE)
    imagenet_validation = imagenet_input.ImageNetInput(
        is_training=False,
        data_dir=FLAGS.data,
        per_core_batch_size=per_core_batch_size)

    callbacks = [
        LearningRateBatchScheduler(schedule=learning_rate_schedule),
        eval_utils.TensorBoardWithValidation(
            log_dir=model_dir,
            validation_imagenet_input=imagenet_validation,
            validation_steps=VALIDATION_STEPS,
            validation_epochs=[30, 60, 90],
            eval_top_k_accuracy=FLAGS.eval_top_5_accuracy),
    ]

    model.fit(
        imagenet_train.input_fn,
        epochs=EPOCHS,
        steps_per_epoch=TRAINING_STEPS_PER_EPOCH,
        callbacks=callbacks)

    if HAS_H5PY:
      weights_file = os.path.join(model_dir, WEIGHTS_TXT)
      logging.info('Save weights into %s', weights_file)
      model.save_weights(weights_file, overwrite=True)
def main(unused_argv):
  tf.enable_v2_behavior()
  num_workers = 1
  job_name = 'worker'
  primary_cpu_task = '/job:%s' % job_name
  is_tpu_pod = num_workers > 1

  model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
  batch_size = PER_CORE_BATCH_SIZE * FLAGS.num_cores
  steps_per_epoch = FLAGS.steps_per_epoch or (
      int(APPROX_IMAGENET_TRAINING_IMAGES // batch_size))
  steps_per_eval = int(1.0 * math.ceil(IMAGENET_VALIDATION_IMAGES / batch_size))

  logging.info('Saving checkpoints at %s', model_dir)

  logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local')
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
      tpu=FLAGS.tpu, job_name=job_name)
  tf.config.experimental_connect_to_host(resolver.master())  # pylint: disable=line-too-long
  tf.tpu.experimental.initialize_tpu_system(resolver)
  strategy = tf.distribute.experimental.TPUStrategy(resolver)

  with tf.device(primary_cpu_task):
    # TODO(b/130307853): In TPU Pod, we have to use
    # `strategy.experimental_distribute_datasets_from_function` instead of
    # `strategy.experimental_distribute_dataset` because dataset cannot be
    # cloned in eager mode. And when using
    # `strategy.experimental_distribute_datasets_from_function`, we should use
    # per core batch size instead of global batch size, because no re-batch is
    # happening in this case.
    if is_tpu_pod:
      imagenet_train = imagenet_input.ImageNetInput(
          is_training=True,
          data_dir=FLAGS.data,
          batch_size=PER_CORE_BATCH_SIZE,
          use_bfloat16=_USE_BFLOAT16)
      imagenet_eval = imagenet_input.ImageNetInput(
          is_training=False,
          data_dir=FLAGS.data,
          batch_size=PER_CORE_BATCH_SIZE,
          use_bfloat16=_USE_BFLOAT16)
      train_dataset = strategy.experimental_distribute_datasets_from_function(
          imagenet_train.input_fn)
      test_dataset = strategy.experimental_distribute_datasets_from_function(
          imagenet_eval.input_fn)
    else:
      imagenet_train = imagenet_input.ImageNetInput(
          is_training=True,
          data_dir=FLAGS.data,
          batch_size=batch_size,
          use_bfloat16=_USE_BFLOAT16)
      imagenet_eval = imagenet_input.ImageNetInput(
          is_training=False,
          data_dir=FLAGS.data,
          batch_size=batch_size,
          use_bfloat16=_USE_BFLOAT16)
      train_dataset = strategy.experimental_distribute_dataset(
          imagenet_train.input_fn())
      test_dataset = strategy.experimental_distribute_dataset(
          imagenet_eval.input_fn())

    with strategy.scope():
      logging.info('Building Keras ResNet-50 model')
      model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
      optimizer = tf.keras.optimizers.SGD(
          learning_rate=ResnetLearningRateSchedule(steps_per_epoch,
                                                   _BASE_LEARNING_RATE),
          momentum=0.9,
          nesterov=True)
      training_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
      training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
          'training_accuracy', dtype=tf.float32)
      test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
      test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
          'test_accuracy', dtype=tf.float32)
      logging.info('Finished building Keras ResNet-50 model')

      checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
      latest_checkpoint = tf.train.latest_checkpoint(model_dir)
      initial_epoch = 0
      if latest_checkpoint:
        checkpoint.restore(latest_checkpoint)
        logging.info('Loaded checkpoint %s', latest_checkpoint)
        initial_epoch = optimizer.iterations.numpy() // steps_per_epoch

    # Create summary writers.
    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(model_dir, 'summaries/train'))
    test_summary_writer = tf.summary.create_file_writer(
        os.path.join(model_dir, 'summaries/test'))

    @tf.function
    def train_step(iterator):
      """Training StepFn."""

      def step_fn(inputs):
        """Per-Replica StepFn."""
        images, labels = inputs
        with tf.GradientTape() as tape:
          logits = model(images, training=True)

          # Loss calculations.
          #
          # Part 1: Prediction loss.
          prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
              labels, logits)
          loss1 = tf.reduce_mean(prediction_loss)
          # Part 2: Model weights regularization.
          loss2 = tf.reduce_sum(model.losses)

          # Scale the loss given the TPUStrategy will reduce sum all gradients.
          loss = loss1 + loss2
          loss = loss / strategy.num_replicas_in_sync

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        training_loss.update_state(loss)
        training_accuracy.update_state(labels, logits)

      strategy.experimental_run_v2(step_fn, args=(next(iterator),))

    @tf.function
    def test_step(iterator):
      """Evaluation StepFn."""

      def step_fn(inputs):
        images, labels = inputs
        logits = model(images, training=False)
        loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
        loss = tf.reduce_mean(loss) / strategy.num_replicas_in_sync
        test_loss.update_state(loss)
        test_accuracy.update_state(labels, logits)

      strategy.experimental_run_v2(step_fn, args=(next(iterator),))

    train_iterator = iter(train_dataset)
    for epoch in range(initial_epoch, FLAGS.num_epochs):
      logging.info('Starting to run epoch: %s', epoch)
      with train_summary_writer.as_default():
        for step in range(steps_per_epoch):
          if step % 20 == 0:
            logging.info('Running step %s in epoch %s', step, epoch)
          train_step(train_iterator)
        tf.summary.scalar(
            'loss', training_loss.result(), step=optimizer.iterations)
        tf.summary.scalar(
            'accuracy', training_accuracy.result(), step=optimizer.iterations)
        logging.info('Training loss: %s, accuracy: %s%%',
                     round(training_loss.result(), 4),
                     round(training_accuracy.result() * 100, 2))
        training_loss.reset_states()
        training_accuracy.reset_states()

      with test_summary_writer.as_default():
        test_iterator = iter(test_dataset)
        for step in range(steps_per_eval):
          if step % 20 == 0:
            logging.info('Starting to run eval step %s of epoch: %s', step,
                         epoch)
          test_step(test_iterator)
        tf.summary.scalar(
            'loss', test_loss.result(), step=optimizer.iterations)
        tf.summary.scalar(
            'accuracy', test_accuracy.result(), step=optimizer.iterations)
        logging.info('Test loss: %s, accuracy: %s%%',
                     round(test_loss.result(), 4),
                     round(test_accuracy.result() * 100, 2))
        test_loss.reset_states()
        test_accuracy.reset_states()

      checkpoint_name = checkpoint.save(os.path.join(model_dir, 'checkpoint'))
      logging.info('Saved checkpoint to %s', checkpoint_name)
def main(unused_argv):
  tf.enable_v2_behavior()
  model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
  batch_size = PER_CORE_BATCH_SIZE * FLAGS.num_cores
  steps_per_epoch = FLAGS.steps_per_epoch or (
      int(APPROX_IMAGENET_TRAINING_IMAGES // batch_size))
  steps_per_eval = int(1.0 * math.ceil(IMAGENET_VALIDATION_IMAGES / batch_size))

  logging.info('Saving checkpoints at %s', model_dir)

  logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local')
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  strategy = tf.distribute.experimental.TPUStrategy(resolver)

  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True,
      data_dir=FLAGS.data,
      batch_size=PER_CORE_BATCH_SIZE,
      use_bfloat16=_USE_BFLOAT16)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False,
      data_dir=FLAGS.data,
      batch_size=PER_CORE_BATCH_SIZE,
      use_bfloat16=_USE_BFLOAT16)
  train_dataset = strategy.experimental_distribute_datasets_from_function(
      imagenet_train.input_fn)
  test_dataset = strategy.experimental_distribute_datasets_from_function(
      imagenet_eval.input_fn)

  if _USE_BFLOAT16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  with strategy.scope():
    logging.info('Building Keras ResNet-50 model')
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
    base_lr = _BASE_LEARNING_RATE * batch_size / 256
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=ResnetLearningRateSchedule(steps_per_epoch, base_lr),
        momentum=0.9,
        nesterov=True)
    training_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'training_accuracy', dtype=tf.float32)
    test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'test_accuracy', dtype=tf.float32)
    logging.info('Finished building Keras ResNet-50 model')

    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    latest_checkpoint = tf.train.latest_checkpoint(model_dir)
    initial_epoch = 0
    if latest_checkpoint:
      # checkpoint.restore must be within a strategy.scope() so that optimizer
      # slot variables are mirrored.
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)
      initial_epoch = optimizer.iterations.numpy() // steps_per_epoch

  # Create summary writers.
  train_summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'summaries/train'))
  test_summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'summaries/test'))

  @tf.function
  def train_step(iterator):
    """Training StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      images, labels = inputs
      with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        if _USE_BFLOAT16:
          predictions = tf.cast(predictions, tf.float32)

        # Loss calculations.
        #
        # Part 1: Prediction loss.
        prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
            labels, predictions)
        loss1 = tf.reduce_mean(prediction_loss)
        # Part 2: Model weights regularization.
        loss2 = tf.reduce_sum(model.losses)

        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        loss = loss1 + loss2
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))
      training_loss.update_state(loss)
      training_accuracy.update_state(labels, predictions)

    strategy.experimental_run_v2(step_fn, args=(next(iterator),))

  @tf.function
  def test_step(iterator):
    """Evaluation StepFn."""

    def step_fn(inputs):
      images, labels = inputs
      predictions = model(images, training=False)
      if _USE_BFLOAT16:
        predictions = tf.cast(predictions, tf.float32)
      loss = tf.keras.losses.sparse_categorical_crossentropy(
          labels, predictions)
      loss = safe_mean(loss)
      test_loss.update_state(loss)
      test_accuracy.update_state(labels, predictions)

    strategy.experimental_run_v2(step_fn, args=(next(iterator),))

  train_iterator = iter(train_dataset)
  for epoch in range(initial_epoch, FLAGS.num_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    with train_summary_writer.as_default():
      for step in range(steps_per_epoch):
        if step % 20 == 0:
          logging.info('Running step %s in epoch %s', step, epoch)
        train_step(train_iterator)
      tf.summary.scalar(
          'loss', training_loss.result(), step=optimizer.iterations)
      tf.summary.scalar(
          'accuracy', training_accuracy.result(), step=optimizer.iterations)
      logging.info('Training loss: %s, accuracy: %s%%',
                   round(training_loss.result(), 4),
                   round(training_accuracy.result() * 100, 2))
      training_loss.reset_states()
      training_accuracy.reset_states()

    with test_summary_writer.as_default():
      test_iterator = iter(test_dataset)
      for step in range(steps_per_eval):
        if step % 20 == 0:
          logging.info('Starting to run eval step %s of epoch: %s', step,
                       epoch)
        test_step(test_iterator)
      tf.summary.scalar('loss', test_loss.result(), step=optimizer.iterations)
      tf.summary.scalar(
          'accuracy', test_accuracy.result(), step=optimizer.iterations)
      logging.info('Test loss: %s, accuracy: %s%%',
                   round(test_loss.result(), 4),
                   round(test_accuracy.result() * 100, 2))
      test_loss.reset_states()
      test_accuracy.reset_states()

    checkpoint_name = checkpoint.save(os.path.join(model_dir, 'checkpoint'))
    logging.info('Saved checkpoint to %s', checkpoint_name)
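# The eval step in the variant above calls `safe_mean`, which is defined
# outside this excerpt. A minimal sketch of one plausible implementation
# (an assumption, not the confirmed original): average the per-example losses
# while returning 0 instead of NaN if the batch happens to be empty. It relies
# on the same `tensorflow as tf` import used throughout this file.
def safe_mean(losses):
  total = tf.reduce_sum(losses)
  num_elements = tf.cast(tf.size(losses), dtype=losses.dtype)
  return tf.math.divide_no_nan(total, num_elements)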
def test_keras_single_step(self):
  resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu='')
  tf.contrib.distribute.initialize_tpu_system(resolver)
  strategy = tf.contrib.distribute.TPUStrategy(resolver)

  np.random.seed(0)
  tf.set_random_seed(0)

  def input_fn():
    batch_size = 1024
    images = np.random.randn(batch_size, *IMAGE_SHAPE).astype(np.float32)
    labels = np.random.randint(
        0, NUM_CLASSES, size=batch_size).astype(np.float32)

    ds = tf.data.Dataset.from_tensor_slices((images, labels))
    ds = ds.map(lambda im, labels: (tf.cast(im, tf.bfloat16), labels))
    ds = ds.repeat()
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds

  with strategy.scope():
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
    model.compile(
        optimizer=gradient_descent.SGD(
            learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
        loss='sparse_categorical_crossentropy')

  # Reinitialize layers with known weights.
  # TODO(power) -- figure out a way to force deterministic initialization
  all_weights = []
  for w in model.get_weights():
    if len(w.shape) == 4:
      scale = np.sqrt(2.0 / (w.shape[0] * w.shape[1] * w.shape[-2]))
      all_weights.append((np.random.random_sample(w.shape) - 0.5) * scale)
    elif len(w.shape) == 2:
      scale = np.sqrt(2.0 / np.prod(w.shape))
      all_weights.append((np.random.random_sample(w.shape) - 0.5) * scale)
    else:
      all_weights.append(np.zeros(w.shape))
  model.set_weights(all_weights)

  lr_schedule_cb = LearningRateBatchScheduler(
      schedule=learning_rate_schedule_wrapper(1))
  training_callbacks = [
      lr_schedule_cb,
  ]

  model.fit(
      input_fn(),
      epochs=90,
      steps_per_epoch=1,
      callbacks=training_callbacks,
      verbose=0)

  weights = model.get_weights()
  golden_weights = [
      (-0.0091566, 0.944489),
      (0.0, 0.0),
      (0.0, 0.0),
      (-0.000772487, 1.4831e-05),
      (110.196, 611.292),
  ]
  try:
    for w, gw in zip(weights, golden_weights):
      assert np.allclose(w.mean(), gw[0])
      assert np.allclose(np.var(w), gw[1])
  except:
    for w in weights:
      tf.logging.info('%s %s', w.mean(), np.var(w))
    raise
def main(unused_argv):
  assert FLAGS.data is not None, 'Provide training data path via --data.'
  tf.enable_v2_behavior()
  tf.compat.v1.disable_eager_execution()  # todo

  batch_size = FLAGS.num_cores * PER_CORE_BATCH_SIZE
  training_steps_per_epoch = FLAGS.steps_per_epoch or (
      int(APPROX_IMAGENET_TRAINING_IMAGES // batch_size))
  validation_steps = int(
      math.ceil(1.0 * IMAGENET_VALIDATION_IMAGES / batch_size))

  model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
  logging.info('Saving tensorboard summaries at %s', model_dir)

  logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local')
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  strategy = tf.distribute.experimental.TPUStrategy(resolver)

  logging.info('Use bfloat16: %s.', USE_BFLOAT16)
  logging.info('Use global batch size: %s.', batch_size)
  logging.info('Enable top 5 accuracy: %s.', FLAGS.eval_top_5_accuracy)
  logging.info('Training model using data in directory "%s".', FLAGS.data)

  with strategy.scope():
    logging.info('Building Keras ResNet-50 model')
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
    # model = keras_applications.mobilenet_v2.MobileNetV2(classes=NUM_CLASSES, weights=None)

    logging.info('Compiling model.')
    metrics = ['sparse_categorical_accuracy']

    if FLAGS.eval_top_5_accuracy:
      metrics.append(sparse_top_k_categorical_accuracy)

    model.compile(
        optimizer=tf.keras.optimizers.SGD(
            learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
        loss='sparse_categorical_crossentropy',
        metrics=metrics)

  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True,
      data_dir=FLAGS.data,
      batch_size=batch_size,
      use_bfloat16=USE_BFLOAT16)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False,
      data_dir=FLAGS.data,
      batch_size=batch_size,
      use_bfloat16=USE_BFLOAT16)

  lr_schedule_cb = LearningRateBatchScheduler(
      schedule=learning_rate_schedule_wrapper(training_steps_per_epoch))
  tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=model_dir)

  training_callbacks = [lr_schedule_cb, tensorboard_cb]

  model.fit(
      imagenet_train.input_fn(),
      epochs=FLAGS.num_epochs,
      steps_per_epoch=training_steps_per_epoch,
      callbacks=training_callbacks,
      validation_data=imagenet_eval.input_fn(),
      validation_steps=validation_steps,
      validation_freq=5)

  model_saving_utils.save_model(model, model_dir, WEIGHTS_TXT)
def main(argv):
  logging.info('Building Keras ResNet-50 model')
  model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
  if FLAGS.use_tpu:
    logging.info('Converting from CPU to TPU model.')
    resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
    model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)
    session_master = resolver.master()
  else:
    session_master = ''

  logging.info('Compiling model.')
  model.compile(
      optimizer=tf.keras.optimizers.SGD(
          lr=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])

  callbacks = [LearningRateBatchScheduler(schedule=learning_rate_schedule)]
  if FLAGS.model_dir:
    callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

  if FLAGS.data is None:
    training_images = np.random.randn(
        BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3).astype(np.float32)
    training_labels = np.random.randint(
        NUM_CLASSES, size=BATCH_SIZE, dtype=np.int32)
    logging.info('Training model using synthetic data.')
    model.fit(
        training_images,
        training_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks)
    logging.info('Evaluating the model on synthetic data.')
    model.evaluate(training_images, training_labels, verbose=0)
  else:
    imagenet_train = imagenet_input.ImageNetInput(
        is_training=True,
        data_dir=FLAGS.data,
        per_core_batch_size=PER_CORE_BATCH_SIZE)
    logging.info('Training model using real data in directory "%s".',
                 FLAGS.data)
    model.fit(
        imagenet_train.input_fn,
        epochs=EPOCHS,
        steps_per_epoch=TRAINING_STEPS_PER_EPOCH,
        callbacks=callbacks)

    logging.info('Evaluating the model on the validation dataset.')
    if FLAGS.eval_top_5_accuracy:
      logging.info('Evaluating top 1 and top 5 accuracy using a Python '
                   'generator.')
      # We feed the inputs from a Python generator, so we need to build a
      # single batch for all of the cores, which will be split on TPU.
      imagenet_eval = imagenet_input.ImageNetInput(
          is_training=False,
          data_dir=FLAGS.data,
          per_core_batch_size=BATCH_SIZE)
      score = eval_utils.multi_top_k_accuracy(
          model, imagenet_eval.evaluation_generator(K.get_session()),
          EVAL_STEPS)
    else:
      imagenet_eval = imagenet_input.ImageNetInput(
          is_training=False,
          data_dir=FLAGS.data,
          per_core_batch_size=PER_CORE_BATCH_SIZE)
      score = model.evaluate(imagenet_eval.input_fn,
                             steps=EVAL_STEPS,
                             verbose=1)
    print('Evaluation score', score)

  if HAS_H5PY:
    weights_file = os.path.join(
        FLAGS.model_dir if FLAGS.model_dir else '/tmp', WEIGHTS_TXT)
    logging.info('Save weights into %s', weights_file)
    model.save_weights(weights_file, overwrite=True)