Example #1
def run():
  """Run the model training and return evaluation output."""
  resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  contrib_distribute.initialize_tpu_system(resolver)
  strategy = contrib_distribute.TPUStrategy(resolver)

  model_cls = MODELS[FLAGS.model]
  if FLAGS.use_synthetic_data:
    data = SyntheticDataset(FLAGS.batch_size)
  else:
    data = Cifar10Dataset(FLAGS.batch_size)

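  # Create and compile the model inside the strategy scope so its variables
  # are placed on the TPU replicas.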
  with strategy.scope():
    model = model_cls(weights=None, input_shape=data.input_shape,
                      classes=data.num_classes)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])

    history = model.fit(
        data.train_dataset,
        epochs=FLAGS.epochs,
        steps_per_epoch=data.num_train_images // FLAGS.batch_size,
        validation_data=data.test_dataset,
        validation_steps=data.num_test_images // FLAGS.batch_size)

    return history.history
Example #2
def run():
    """Run the model training and return evaluation output."""
    resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    contrib_distribute.initialize_tpu_system(resolver)
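    # steps_per_run executes 100 training steps per call into the TPU,
    # amortizing the host-to-device dispatch overhead.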
    strategy = contrib_distribute.TPUStrategy(resolver, steps_per_run=100)

    if FLAGS.fake_data:
        print("Using fake data")
        x_train = np.random.random((BATCH_SIZE, IMG_ROWS, IMG_COLS))
        y_train = np.zeros([BATCH_SIZE, 1], dtype=np.int32)
        x_test, y_test = x_train, y_train
    else:
        # The data, split between train and test sets.
        print("Using real data")
        (x_train, y_train), (x_test,
                             y_test) = tf.keras.datasets.mnist.load_data()

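    # Reshape to NHWC with a single channel so the images match the model's
    # expected input_shape.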
    x_train = x_train.reshape(x_train.shape[0], IMG_ROWS, IMG_COLS, 1)
    x_test = x_test.reshape(x_test.shape[0], IMG_ROWS, IMG_COLS, 1)
    input_shape = (IMG_ROWS, IMG_COLS, 1)

    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print("x_train shape:", x_train.shape)
    print(x_train.shape[0], "train samples")
    print(x_test.shape[0], "test samples")

    # convert class vectors to binary class matrices
    y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES)
    y_test = tf.keras.utils.to_categorical(y_test, NUM_CLASSES)
    with strategy.scope():
        model = mnist_model(input_shape)
        model.compile(
            loss=tf.keras.losses.categorical_crossentropy,
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05),
            metrics=["accuracy"],
        )

    callbacks = []
    if FLAGS.model_dir:
        callbacks = [tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir)]

    model.fit(
        x_train,
        y_train,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        epochs=EPOCHS,
        verbose=1,
        validation_data=(x_test, y_test),
    )
    return model.evaluate(x_test, y_test, batch_size=BATCH_SIZE, verbose=1)
Example #3
def main(unused_argv):
    assert FLAGS.data is not None, 'Provide training data path via --data.'

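    # The global batch size spans all TPU cores.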
    batch_size = FLAGS.num_cores * PER_CORE_BATCH_SIZE

    training_steps_per_epoch = FLAGS.steps_per_epoch or int(
        APPROX_IMAGENET_TRAINING_IMAGES // batch_size)
    validation_steps = int(
        math.ceil(1.0 * IMAGENET_VALIDATION_IMAGES / batch_size))

    model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
    logging.info('Saving tensorboard summaries at %s', model_dir)

    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    contrib_distribute.initialize_tpu_system(resolver)
    strategy = contrib_distribute.TPUStrategy(resolver)

    logging.info('Use bfloat16: %s.', USE_BFLOAT16)
    logging.info('Use global batch size: %s.', batch_size)
    logging.info('Enable top 5 accuracy: %s.', FLAGS.eval_top_5_accuracy)
    logging.info('Training model using data in directory "%s".', FLAGS.data)

    with strategy.scope():
        logging.info('Building Keras ResNet-50 model')
        model = resnet_model.ResNet50(num_classes=NUM_CLASSES)

        logging.info('Compiling model.')
        metrics = ['sparse_categorical_accuracy']

        if FLAGS.eval_top_5_accuracy:
            metrics.append(sparse_top_k_categorical_accuracy)

        model.compile(optimizer=tf.keras.optimizers.SGD(
            learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
                      loss='sparse_categorical_crossentropy',
                      metrics=metrics)

    imagenet_train = imagenet_input.ImageNetInput(is_training=True,
                                                  data_dir=FLAGS.data,
                                                  batch_size=batch_size,
                                                  use_bfloat16=USE_BFLOAT16)
    imagenet_eval = imagenet_input.ImageNetInput(is_training=False,
                                                 data_dir=FLAGS.data,
                                                 batch_size=batch_size,
                                                 use_bfloat16=USE_BFLOAT16)

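    # Adjust the learning rate each batch per the schedule and write
    # TensorBoard summaries during training.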
    lr_schedule_cb = LearningRateBatchScheduler(
        schedule=learning_rate_schedule_wrapper(training_steps_per_epoch))
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=model_dir)

    training_callbacks = [lr_schedule_cb, tensorboard_cb]

    model.fit(imagenet_train.input_fn(),
              epochs=FLAGS.num_epochs,
              steps_per_epoch=training_steps_per_epoch,
              callbacks=training_callbacks,
              validation_data=imagenet_eval.input_fn(),
              validation_steps=validation_steps,
              validation_freq=5)

    model_saving_utils.save_model(model, model_dir, WEIGHTS_TXT)
Example #4
def main(unused_argv):
    """Starts a ResNet training session."""
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    # Estimator looks at the master it connects to for MonitoredTrainingSession
    # by reading the `TF_CONFIG` environment variable.
    tf_config_env = {
        'session_master': tpu_cluster_resolver.get_master(),
        'eval_session_master': tpu_cluster_resolver.get_master()
    }
    os.environ['TF_CONFIG'] = json.dumps(tf_config_env)

    steps_per_run_train = _NUM_TRAIN_IMAGES // (FLAGS.train_batch_size *
                                                FLAGS.num_cores)
    steps_per_run_eval = _NUM_EVAL_IMAGES // (FLAGS.eval_batch_size *
                                              FLAGS.num_cores)
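    # Save a checkpoint and run evaluation once per epoch of training steps.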
    steps_per_eval = steps_per_run_train

    train_distribution = contrib_distribute.TPUStrategy(
        tpu_cluster_resolver, steps_per_run=steps_per_run_train)
    eval_distribution = contrib_distribute.TPUStrategy(
        tpu_cluster_resolver, steps_per_run=steps_per_run_eval)
    config = tf.estimator.RunConfig(model_dir=FLAGS.model_dir,
                                    train_distribute=train_distribution,
                                    eval_distribute=eval_distribution,
                                    save_checkpoints_steps=steps_per_eval,
                                    save_checkpoints_secs=None,
                                    keep_checkpoint_max=10)

    resnet_estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)

    train_input, eval_input = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            use_bfloat16=(FLAGS.precision == 'bfloat16'))
        for is_training in [True, False]
    ]

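    # Resume from the global step recorded in the latest checkpoint, if any.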
    try:
        current_step = resnet_estimator.get_variable_value(
            tf.GraphKeys.GLOBAL_STEP)
    except ValueError:
        current_step = 0

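    # Alternate training up to the next checkpoint boundary with a full
    # evaluation pass until the target step count is reached.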
    while current_step < _TRAIN_STEPS:
        next_checkpoint = min(current_step + steps_per_eval, _TRAIN_STEPS)

        resnet_estimator.train(
            input_fn=lambda: train_input.input_fn(  # pylint: disable=g-long-lambda
                {'batch_size': FLAGS.train_batch_size}),
            max_steps=next_checkpoint)
        current_step = next_checkpoint

        eval_results = resnet_estimator.evaluate(
            input_fn=lambda: eval_input.input_fn(  # pylint: disable=g-long-lambda
                {'batch_size': FLAGS.eval_batch_size}),
            steps=steps_per_run_eval)

        tf.logging.info('Eval results: %s', eval_results)
Example #5
  def test_keras_single_step(self):
    resolver = contrib_cluster_resolver.TPUClusterResolver(tpu='')
    contrib_distribute.initialize_tpu_system(resolver)
    strategy = contrib_distribute.TPUStrategy(resolver)
    np.random.seed(0)
    tf.set_random_seed(0)

    def input_fn():
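      """Builds an in-memory dataset of random bfloat16 images and labels."""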
      batch_size = 128 * NUM_REPLICAS
      images = np.random.randn(batch_size, *IMAGE_SHAPE).astype(np.float32)
      labels = np.random.randint(
          0, NUM_CLASSES, size=batch_size).astype(np.float32)

      ds = tf.data.Dataset.from_tensor_slices((images, labels))
      ds = ds.map(lambda im, labels: (tf.cast(im, tf.bfloat16), labels))
      ds = ds.repeat()
      ds = ds.batch(batch_size, drop_remainder=True)
      return ds

    with strategy.scope():
      model = resnet_model.ResNet50(num_classes=NUM_CLASSES)

      model.compile(
          optimizer=gradient_descent.SGD(
              learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
          loss='sparse_categorical_crossentropy')

      # Reinitialize layers with known weights.
      # TODO(power) -- figure out a way to force deterministic initialization
      all_weights = []
      for w in model.get_weights():
        if len(w.shape) == 4:
          scale = np.sqrt(2.0 / (w.shape[0] * w.shape[1] * w.shape[-2]))
          all_weights.append((np.random.random_sample(w.shape) - 0.5) * scale)
        elif len(w.shape) == 2:
          scale = np.sqrt(2.0 / np.prod(w.shape))
          all_weights.append((np.random.random_sample(w.shape) - 0.5) * scale)
        else:
          all_weights.append(np.zeros(w.shape))
      model.set_weights(all_weights)

    lr_schedule_cb = LearningRateBatchScheduler(
        schedule=learning_rate_schedule_wrapper(1))
    training_callbacks = [
        lr_schedule_cb,
    ]

    model.fit(
        input_fn(),
        epochs=90,
        steps_per_epoch=1,
        callbacks=training_callbacks,
        verbose=0)

    weights = model.get_weights()
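    # Expected (mean, variance) for the first few weight tensors; zip() only
    # compares as many tensors as there are golden values.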
    golden_weights = [
        (-0.000503229, 0.00108613),
        (0.0, 0.0),
        (0.0, 0.0),
        (-2.33946e-06, 3.93077e-08),
        (0.157237, 0.000115255),
    ]
    try:
      for w, gw in zip(weights, golden_weights):
        assert np.allclose(w.mean(), gw[0])
        assert np.allclose(np.var(w), gw[1])
    except AssertionError:
      for w in weights:
        tf.logging.info('%s %s', w.mean(), np.var(w))
      raise