def run(): """Run the model training and return evaluation output.""" resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu) contrib_distribute.initialize_tpu_system(resolver) strategy = contrib_distribute.TPUStrategy(resolver) model_cls = MODELS[FLAGS.model] if FLAGS.use_synthetic_data: data = SyntheticDataset(FLAGS.batch_size) else: data = Cifar10Dataset(FLAGS.batch_size) with strategy.scope(): model = model_cls(weights=None, input_shape=data.input_shape, classes=data.num_classes) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]) history = model.fit( data.train_dataset, epochs=FLAGS.epochs, steps_per_epoch=data.num_train_images // FLAGS.batch_size, validation_data=data.test_dataset, validation_steps=data.num_test_images // FLAGS.batch_size) return history.history
def run(): """Run the model training and return evaluation output.""" resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu) contrib_distribute.initialize_tpu_system(resolver) strategy = contrib_distribute.TPUStrategy(resolver, steps_per_run=100) if FLAGS.fake_data: print("Using fake data") x_train = np.random.random((BATCH_SIZE, IMG_ROWS, IMG_COLS)) y_train = np.zeros([BATCH_SIZE, 1], dtype=np.int32) x_test, y_test = x_train, y_train else: # the data, split between train and test sets print("Using real data") (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data() x_train = x_train.reshape(x_train.shape[0], IMG_ROWS, IMG_COLS, 1) x_test = x_test.reshape(x_test.shape[0], IMG_ROWS, IMG_COLS, 1) input_shape = (IMG_ROWS, IMG_COLS, 1) x_train = x_train.astype("float32") x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 print("x_train shape:", x_train.shape) print(x_train.shape[0], "train samples") print(x_test.shape[0], "test samples") # convert class vectors to binary class matrices y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES) y_test = tf.keras.utils.to_categorical(y_test, NUM_CLASSES) with strategy.scope(): model = mnist_model(input_shape) model.compile( loss=tf.keras.losses.categorical_crossentropy, optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05), metrics=["accuracy"], ) callbacks = [] if FLAGS.model_dir: callbacks = [tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir)] model.fit( x_train, y_train, batch_size=BATCH_SIZE, callbacks=callbacks, epochs=EPOCHS, verbose=1, validation_data=(x_test, y_test), ) return model.evaluate(x_test, y_test, batch_size=BATCH_SIZE, verbose=1)
def main(unused_argv):
  assert FLAGS.data is not None, 'Provide training data path via --data.'

  batch_size = FLAGS.num_cores * PER_CORE_BATCH_SIZE
  training_steps_per_epoch = FLAGS.steps_per_epoch or (
      int(APPROX_IMAGENET_TRAINING_IMAGES // batch_size))
  validation_steps = int(
      math.ceil(1.0 * IMAGENET_VALIDATION_IMAGES / batch_size))

  model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
  logging.info('Saving tensorboard summaries at %s', model_dir)
  logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local')

  resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  contrib_distribute.initialize_tpu_system(resolver)
  strategy = contrib_distribute.TPUStrategy(resolver)

  logging.info('Use bfloat16: %s.', USE_BFLOAT16)
  logging.info('Use global batch size: %s.', batch_size)
  logging.info('Enable top 5 accuracy: %s.', FLAGS.eval_top_5_accuracy)
  logging.info('Training model using data in directory "%s".', FLAGS.data)

  with strategy.scope():
    logging.info('Building Keras ResNet-50 model')
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)

    logging.info('Compiling model.')
    metrics = ['sparse_categorical_accuracy']
    if FLAGS.eval_top_5_accuracy:
      metrics.append(sparse_top_k_categorical_accuracy)

    model.compile(
        optimizer=tf.keras.optimizers.SGD(
            learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
        loss='sparse_categorical_crossentropy',
        metrics=metrics)

  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True,
      data_dir=FLAGS.data,
      batch_size=batch_size,
      use_bfloat16=USE_BFLOAT16)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False,
      data_dir=FLAGS.data,
      batch_size=batch_size,
      use_bfloat16=USE_BFLOAT16)

  lr_schedule_cb = LearningRateBatchScheduler(
      schedule=learning_rate_schedule_wrapper(training_steps_per_epoch))
  tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=model_dir)
  training_callbacks = [lr_schedule_cb, tensorboard_cb]

  model.fit(
      imagenet_train.input_fn(),
      epochs=FLAGS.num_epochs,
      steps_per_epoch=training_steps_per_epoch,
      callbacks=training_callbacks,
      validation_data=imagenet_eval.input_fn(),
      validation_steps=validation_steps,
      validation_freq=5)

  model_saving_utils.save_model(model, model_dir, WEIGHTS_TXT)
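# sparse_top_k_categorical_accuracy is referenced above but defined elsewhere
# in this file. A straightforward definition consistent with its use behind the
# --eval_top_5_accuracy flag would be the wrapper below; treat the exact
# signature as an assumption about the repo's helper.
def _example_sparse_top_k_categorical_accuracy(y_true, y_pred):
  """Illustrative top-5 accuracy metric matching the flag name above."""
  return tf.keras.metrics.sparse_top_k_categorical_accuracy(
      y_true, y_pred, k=5)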
def main(unused_argv): """Starts a ResNet training session.""" tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) # Estimator looks at the master it connects to for MonitoredTrainingSession # by reading the `TF_CONFIG` environment variable. tf_config_env = { 'session_master': tpu_cluster_resolver.get_master(), 'eval_session_master': tpu_cluster_resolver.get_master() } os.environ['TF_CONFIG'] = json.dumps(tf_config_env) steps_per_run_train = _NUM_TRAIN_IMAGES // (FLAGS.train_batch_size * FLAGS.num_cores) steps_per_run_eval = _NUM_EVAL_IMAGES // (FLAGS.eval_batch_size * FLAGS.num_cores) steps_per_eval = steps_per_run_train train_distribution = contrib_distribute.TPUStrategy( tpu_cluster_resolver, steps_per_run=steps_per_run_train) eval_distribution = contrib_distribute.TPUStrategy( tpu_cluster_resolver, steps_per_run=steps_per_run_eval) config = tf.estimator.RunConfig(model_dir=FLAGS.model_dir, train_distribute=train_distribution, eval_distribute=eval_distribution, save_checkpoints_steps=steps_per_eval, save_checkpoints_secs=None, keep_checkpoint_max=10) resnet_estimator = tf.estimator.Estimator(model_fn=model_fn, config=config) train_input, eval_input = [ imagenet_input.ImageNetInput( is_training=is_training, data_dir=FLAGS.data_dir, transpose_input=FLAGS.transpose_input, use_bfloat16=(FLAGS.precision == 'bfloat16')) for is_training in [True, False] ] try: current_step = resnet_estimator.get_variable_value( tf.GraphKeys.GLOBAL_STEP) except ValueError: current_step = 0 while current_step < _TRAIN_STEPS: next_checkpoint = min(current_step + steps_per_eval, _TRAIN_STEPS) resnet_estimator.train( input_fn=lambda: train_input.input_fn( # pylint: disable=g-long-lambda {'batch_size': FLAGS.train_batch_size}), max_steps=next_checkpoint) current_step = next_checkpoint eval_results = resnet_estimator.evaluate( input_fn=lambda: eval_input.input_fn( # pylint: disable=g-long-lambda {'batch_size': FLAGS.eval_batch_size}), steps=_NUM_EVAL_IMAGES // (FLAGS.eval_batch_size * FLAGS.num_cores)) tf.logging.info('Eval results: %s' % eval_results)
def test_keras_single_step(self):
  resolver = contrib_cluster_resolver.TPUClusterResolver(tpu='')
  contrib_distribute.initialize_tpu_system(resolver)
  strategy = contrib_distribute.TPUStrategy(resolver)

  np.random.seed(0)
  tf.set_random_seed(0)

  def input_fn():
    batch_size = 128 * NUM_REPLICAS
    images = np.random.randn(batch_size, *IMAGE_SHAPE).astype(np.float32)
    labels = np.random.randint(
        0, NUM_CLASSES, size=batch_size).astype(np.float32)

    ds = tf.data.Dataset.from_tensor_slices((images, labels))
    ds = ds.map(lambda im, labels: (tf.cast(im, tf.bfloat16), labels))
    ds = ds.repeat()
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds

  with strategy.scope():
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
    model.compile(
        optimizer=gradient_descent.SGD(
            learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
        loss='sparse_categorical_crossentropy')

  # Reinitialize layers with known weights.
  # TODO(power) -- figure out a way to force deterministic initialization
  all_weights = []
  for w in model.get_weights():
    if len(w.shape) == 4:
      # He-style scaling for conv kernels.
      scale = np.sqrt(2.0 / (w.shape[0] * w.shape[1] * w.shape[-2]))
      all_weights.append((np.random.random_sample(w.shape) - 0.5) * scale)
    elif len(w.shape) == 2:
      # Dense kernels.
      scale = np.sqrt(2.0 / np.prod(w.shape))
      all_weights.append((np.random.random_sample(w.shape) - 0.5) * scale)
    else:
      # Everything else (biases, batch-norm parameters) is zeroed.
      all_weights.append(np.zeros(w.shape))
  model.set_weights(all_weights)

  lr_schedule_cb = LearningRateBatchScheduler(
      schedule=learning_rate_schedule_wrapper(1))
  training_callbacks = [
      lr_schedule_cb,
  ]

  model.fit(
      input_fn(),
      epochs=90,
      steps_per_epoch=1,
      callbacks=training_callbacks,
      verbose=0)

  weights = model.get_weights()
  golden_weights = [
      (-0.000503229, 0.00108613),
      (0.0, 0.0),
      (0.0, 0.0),
      (-2.33946e-06, 3.93077e-08),
      (0.157237, 0.000115255),
  ]

  try:
    for w, gw in zip(weights, golden_weights):
      assert np.allclose(w.mean(), gw[0])
      assert np.allclose(np.var(w), gw[1])
  except Exception:
    # Dump the observed statistics before re-raising to aid debugging.
    for w in weights:
      tf.logging.info('%s %s', w.mean(), np.var(w))
    raise
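# LearningRateBatchScheduler and learning_rate_schedule_wrapper are defined
# elsewhere in this repo. The sketch below shows one way such a pair could be
# written: a Keras callback that queries a schedule(epoch, batch) function on
# every batch and updates the optimizer's learning rate. The decay boundaries
# and the unused steps_per_epoch argument are illustrative assumptions, not
# the repo's actual schedule.
class _ExampleLearningRateBatchScheduler(tf.keras.callbacks.Callback):
  """Illustrative per-batch LR scheduler; not the repo's implementation."""

  def __init__(self, schedule):
    super(_ExampleLearningRateBatchScheduler, self).__init__()
    self.schedule = schedule
    self.epoch = 0

  def on_epoch_begin(self, epoch, logs=None):
    self.epoch = epoch

  def on_batch_begin(self, batch, logs=None):
    lr = self.schedule(self.epoch, batch)
    tf.keras.backend.set_value(self.model.optimizer.lr, lr)


def _example_learning_rate_schedule_wrapper(steps_per_epoch):
  """Returns an illustrative schedule(epoch, batch) -> learning rate."""
  del steps_per_epoch  # Unused in this simplified sketch.

  def schedule(epoch, batch):
    del batch  # This sketch only decays per epoch.
    # Step decay at assumed epoch boundaries 30/60/80.
    if epoch < 30:
      return BASE_LEARNING_RATE
    elif epoch < 60:
      return BASE_LEARNING_RATE * 0.1
    elif epoch < 80:
      return BASE_LEARNING_RATE * 0.01
    return BASE_LEARNING_RATE * 0.001

  return schedule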