Code example #1
def main(unused_argv):
  assert FLAGS.data is not None, 'Provide training data path via --data.'

  batch_size = FLAGS.num_cores * PER_CORE_BATCH_SIZE
  training_steps_per_epoch = APPROX_IMAGENET_TRAINING_IMAGES // batch_size
  validation_steps = IMAGENET_VALIDATION_IMAGES // batch_size

  model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
  logging.info('Saving tensorboard summaries at %s', model_dir)

  logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local')
  resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
  tf.contrib.distribute.initialize_tpu_system(resolver)
  strategy = tf.contrib.distribute.TPUStrategy(resolver)

  logging.info('Use bfloat16: %s.', USE_BFLOAT16)
  logging.info('Use global batch size: %s.', batch_size)
  logging.info('Enable top 5 accuracy: %s.', FLAGS.eval_top_5_accuracy)
  logging.info('Training model using data in directory "%s".', FLAGS.data)

  with strategy.scope():
    logging.info('Building Keras ResNet-50 model')
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)

    logging.info('Compiling model.')
    metrics = ['sparse_categorical_accuracy']

    if FLAGS.eval_top_5_accuracy:
      metrics.append(sparse_top_k_categorical_accuracy)

    model.compile(
        optimizer=gradient_descent.SGD(
            learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
        loss='sparse_categorical_crossentropy',
        metrics=metrics)

  imagenet_train = imagenet_input.ImageNetInput(
      is_training=True, data_dir=FLAGS.data, batch_size=batch_size,
      use_bfloat16=USE_BFLOAT16)
  imagenet_eval = imagenet_input.ImageNetInput(
      is_training=False, data_dir=FLAGS.data, batch_size=batch_size,
      use_bfloat16=USE_BFLOAT16)

  lr_schedule_cb = LearningRateBatchScheduler(
      schedule=learning_rate_schedule_wrapper(training_steps_per_epoch))
  tensorboard_cb = eval_utils.TensorBoardWithValidation(
      log_dir=model_dir,
      validation_imagenet_input=imagenet_eval,
      validation_steps=validation_steps,
      validation_epochs=[30, 60, 90])

  training_callbacks = [lr_schedule_cb, tensorboard_cb]

  model.fit(
      imagenet_train.input_fn(),
      epochs=EPOCHS,
      steps_per_epoch=training_steps_per_epoch,
      callbacks=training_callbacks)

  model_saving_utils.save_model(model, model_dir, WEIGHTS_TXT)
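Examples #1, #6, and #7 reference a LearningRateBatchScheduler callback and a learning_rate_schedule_wrapper helper that are defined elsewhere in the source file. The sketch below is a plausible reconstruction, assuming the usual 5-epoch linear warmup followed by step decay for ResNet-50 on ImageNet; the schedule table and the exact use of BASE_LEARNING_RATE are assumptions, not the original definitions.

# Hypothetical reconstruction -- the original script defines its own schedule.
# Each entry is (multiplier, epoch at which it takes effect).
LR_SCHEDULE = [(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)]

def learning_rate_schedule_wrapper(training_steps_per_epoch):
  """Returns a schedule(epoch, batch) function closed over the step count."""

  def learning_rate_schedule(current_epoch, current_batch):
    epoch = current_epoch + float(current_batch) / training_steps_per_epoch
    warmup_multiplier, warmup_end_epoch = LR_SCHEDULE[0]
    if epoch < warmup_end_epoch:
      # Linear warmup from 0 to the base learning rate.
      return BASE_LEARNING_RATE * warmup_multiplier * epoch / warmup_end_epoch
    learning_rate = BASE_LEARNING_RATE * warmup_multiplier
    for multiplier, start_epoch in LR_SCHEDULE:
      if epoch >= start_epoch:
        learning_rate = BASE_LEARNING_RATE * multiplier
    return learning_rate

  return learning_rate_schedule

class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
  """Keras callback that sets the learning rate at every batch boundary."""

  def __init__(self, schedule):
    super(LearningRateBatchScheduler, self).__init__()
    self.schedule = schedule
    self.epochs = -1

  def on_epoch_begin(self, epoch, logs=None):
    self.epochs += 1

  def on_batch_begin(self, batch, logs=None):
    lr = self.schedule(self.epochs, batch)
    tf.keras.backend.set_value(self.model.optimizer.lr, lr)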
Code example #2
def main(unused_argv):

    model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
    batch_size = PER_CORE_BATCH_SIZE * FLAGS.num_cores
    steps_per_epoch = FLAGS.steps_per_epoch or (
        APPROX_IMAGENET_TRAINING_IMAGES // batch_size)
    steps_per_eval = IMAGENET_VALIDATION_IMAGES // batch_size

    logging.info('Saving checkpoints at %s', model_dir)

    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

    imagenet_train = imagenet_input.ImageNetInput(is_training=True,
                                                  data_dir=FLAGS.data,
                                                  batch_size=batch_size,
                                                  use_bfloat16=_USE_BFLOAT16)
    imagenet_eval = imagenet_input.ImageNetInput(is_training=False,
                                                 data_dir=FLAGS.data,
                                                 batch_size=batch_size,
                                                 use_bfloat16=_USE_BFLOAT16)

    train_iterator = strategy.experimental_distribute_dataset(
        imagenet_train.input_fn()).make_initializable_iterator()
    test_iterator = strategy.experimental_distribute_dataset(
        imagenet_eval.input_fn()).make_initializable_iterator()

    with strategy.scope():
        logging.info('Building Keras ResNet-50 model')
        model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
        optimizer = tf.keras.optimizers.SGD(learning_rate=_BASE_LEARNING_RATE,
                                            momentum=0.9,
                                            nesterov=True)
        training_loss = tf.keras.metrics.Mean('training_loss',
                                              dtype=tf.float32)
        training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'training_accuracy', dtype=tf.float32)
        test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'test_accuracy', dtype=tf.float32)
        logging.info('Finished building Keras ResNet-50 model')

    def train_step(inputs):
        """Training StepFn."""
        images, labels = inputs
        with tf.GradientTape() as tape:
            predictions = model(images, training=True)

            # Loss calculations.
            #
            # Part 1: Prediction loss.
            prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
                labels, predictions)
            loss1 = tf.reduce_mean(prediction_loss)
            # Part 2: Model weights regularization
            loss2 = tf.reduce_sum(model.losses)

            # Scale the loss, since TPUStrategy sums gradients across replicas.
            loss = loss1 + loss2
            scaled_loss = loss / strategy.num_replicas_in_sync

        grads = tape.gradient(scaled_loss, model.trainable_variables)
        update_vars = optimizer.apply_gradients(
            zip(grads, model.trainable_variables))
        update_loss = training_loss.update_state(loss)
        update_accuracy = training_accuracy.update_state(labels, predictions)
        with tf.control_dependencies(
            [update_vars, update_loss, update_accuracy]):
            return tf.identity(loss)

    def test_step(inputs):
        """Evaluation StepFn."""
        images, labels = inputs
        predictions = model(images, training=False)
        loss = tf.keras.losses.sparse_categorical_crossentropy(
            labels, predictions)
        loss = tf.reduce_mean(loss)
        update_loss = test_loss.update_state(loss)
        update_accuracy = test_accuracy.update_state(labels, predictions)
        with tf.control_dependencies([update_loss, update_accuracy]):
            return tf.identity(loss)

    dist_train = strategy.experimental_local_results(
        strategy.run(train_step, args=(next(train_iterator), )))
    dist_test = strategy.experimental_local_results(
        strategy.run(test_step, args=(next(test_iterator), )))

    training_loss_result = training_loss.result()
    training_accuracy_result = training_accuracy.result()
    test_loss_result = test_loss.result()
    test_accuracy_result = test_accuracy.result()

    train_iterator_init = train_iterator.initialize()
    test_iterator_init = test_iterator.initialize()

    config = tf.ConfigProto()
    config.allow_soft_placement = True
    cluster_spec = resolver.cluster_spec()
    if cluster_spec:
        config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
    with tf.Session(target=resolver.master(), config=config) as sess:
        all_variables = (tf.global_variables() + training_loss.variables +
                         training_accuracy.variables + test_loss.variables +
                         test_accuracy.variables)
        sess.run([v.initializer for v in all_variables])
        sess.run(train_iterator_init)

        for epoch in range(0, FLAGS.num_epochs):
            logging.info('Starting to run epoch: %s', epoch)
            for step in range(steps_per_epoch):
                learning_rate = compute_learning_rate(epoch + 1 +
                                                      (float(step) /
                                                       steps_per_epoch))
                sess.run(optimizer.lr.assign(learning_rate))
                if step % 20 == 0:
                    logging.info('Learning rate at step %s in epoch %s is %s',
                                 step, epoch, learning_rate)
                sess.run(dist_train)
                if step % 20 == 0:
                    logging.info(
                        'Training loss: %s, accuracy: %s%%',
                        round(sess.run(training_loss_result), 4),
                        round(sess.run(training_accuracy_result) * 100, 2))
                training_loss.reset_states()
                training_accuracy.reset_states()

            sess.run(test_iterator_init)
            for step in range(steps_per_eval):
                if step % 20 == 0:
                    logging.info('Starting to run eval step %s of epoch: %s',
                                 step, epoch)
                sess.run(dist_test)
                if step % 20 == 0:
                    logging.info(
                        'Test loss: %s, accuracy: %s%%',
                        round(sess.run(test_loss_result), 4),
                        round(sess.run(test_accuracy_result) * 100, 2))
                test_loss.reset_states()
                test_accuracy.reset_states()
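Example #2 assigns the learning rate by hand through optimizer.lr.assign, using a compute_learning_rate helper that the snippet does not show. A minimal sketch, assuming the same warmup-plus-step-decay policy as the other examples (_BASE_LEARNING_RATE is the script's base rate; the breakpoints are illustrative):

# Hypothetical reconstruction of the helper assumed by example #2.
def compute_learning_rate(lr_epoch):
    """Step-decay learning rate with linear warmup over the first 5 epochs."""
    if lr_epoch <= 5:
        return _BASE_LEARNING_RATE * lr_epoch / 5  # linear warmup
    for multiplier, start_epoch in [(0.001, 80), (0.01, 60), (0.1, 30)]:
        if lr_epoch >= start_epoch:
            return _BASE_LEARNING_RATE * multiplier
    return _BASE_LEARNING_RATE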
Code example #3
def main(argv):
  logging.info('Building Keras ResNet-50 model')
  model = resnet_model.ResNet50(num_classes=NUM_CLASSES)

  if FLAGS.use_tpu:
    logging.info('Converting from CPU to TPU model.')
    resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
    model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)

  logging.info('Compiling model.')
  model.compile(
      optimizer=tf.keras.optimizers.SGD(lr=BASE_LEARNING_RATE,
                                        momentum=0.9,
                                        nesterov=True),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])

  if FLAGS.data is None:
    training_images = np.random.randn(
        BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3).astype(np.float32)
    training_labels = np.random.randint(NUM_CLASSES, size=BATCH_SIZE,
                                        dtype=np.int32)
    logging.info('Training model using synthetic data.')
    model.fit(
        training_images,
        training_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE)
    logging.info('Evaluating the model on synthetic data.')
    model.evaluate(training_images, training_labels, verbose=0)
  else:
    model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
    imagenet_train = imagenet_input.ImageNetInput(
        is_training=True,
        data_dir=FLAGS.data,
        per_core_batch_size=PER_CORE_BATCH_SIZE)
    logging.info('Training model using real data in directory "%s".',
                 FLAGS.data)
    # If evaluating top 5 accuracy, we feed the inputs from a Python generator,
    # so we need to build a single batch for all of the cores, which will be
    # split on TPU.
    per_core_batch_size = (
        BATCH_SIZE if FLAGS.eval_top_5_accuracy else PER_CORE_BATCH_SIZE)
    imagenet_validation = imagenet_input.ImageNetInput(
        is_training=False,
        data_dir=FLAGS.data,
        per_core_batch_size=per_core_batch_size)

    callbacks = [
        LearningRateBatchScheduler(schedule=learning_rate_schedule),
        eval_utils.TensorBoardWithValidation(
            log_dir=model_dir,
            validation_imagenet_input=imagenet_validation,
            validation_steps=VALIDATION_STEPS,
            validation_epochs=[30, 60, 90],
            eval_top_k_accuracy=FLAGS.eval_top_5_accuracy),
    ]

    model.fit(imagenet_train.input_fn,
              epochs=EPOCHS,
              steps_per_epoch=TRAINING_STEPS_PER_EPOCH,
              callbacks=callbacks)

    if HAS_H5PY:
      weights_file = os.path.join(model_dir, WEIGHTS_TXT)
      logging.info('Save weights into %s', weights_file)
      model.save_weights(weights_file, overwrite=True)
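Examples #3 and #8 depend on module-level constants and an HAS_H5PY guard that the snippets do not show. The block below spells out plausible definitions; the names come from the code above, but every value is an illustrative assumption:

# Hypothetical module-level constants assumed by examples #3 and #8.
IMAGE_SIZE = 224
NUM_CLASSES = 1000
PER_CORE_BATCH_SIZE = 128
NUM_CORES = 8
BATCH_SIZE = PER_CORE_BATCH_SIZE * NUM_CORES  # global batch, split over cores
BASE_LEARNING_RATE = 0.1  # assumed value
EPOCHS = 90
APPROX_IMAGENET_TRAINING_IMAGES = 1281167
IMAGENET_VALIDATION_IMAGES = 50000
TRAINING_STEPS_PER_EPOCH = APPROX_IMAGENET_TRAINING_IMAGES // BATCH_SIZE
VALIDATION_STEPS = IMAGENET_VALIDATION_IMAGES // BATCH_SIZE
EVAL_STEPS = VALIDATION_STEPS
WEIGHTS_TXT = 'resnet50_weights.h5'

# Saving weights in HDF5 format requires h5py, which may be absent.
try:
  import h5py  # noqa: F401
  HAS_H5PY = True
except ImportError:
  HAS_H5PY = False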
Code example #4
def main(unused_argv):
    tf.enable_v2_behavior()
    num_workers = 1
    job_name = 'worker'
    primary_cpu_task = '/job:%s' % job_name

    is_tpu_pod = num_workers > 1
    model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
    batch_size = PER_CORE_BATCH_SIZE * FLAGS.num_cores
    steps_per_epoch = FLAGS.steps_per_epoch or (
        APPROX_IMAGENET_TRAINING_IMAGES // batch_size)
    steps_per_eval = int(
        math.ceil(IMAGENET_VALIDATION_IMAGES / batch_size))

    logging.info('Saving checkpoints at %s', model_dir)

    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu, job_name=job_name)
    tf.config.experimental_connect_to_host(resolver.master())  # pylint: disable=line-too-long
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

    with tf.device(primary_cpu_task):
        # TODO(b/130307853): On a TPU Pod we have to use
        # `strategy.experimental_distribute_datasets_from_function` instead of
        # `strategy.experimental_distribute_dataset`, because a dataset cannot
        # be cloned in eager mode. When distributing datasets from a function,
        # we must pass the per-core batch size instead of the global batch
        # size, because no re-batching happens in that case.
        if is_tpu_pod:
            imagenet_train = imagenet_input.ImageNetInput(
                is_training=True,
                data_dir=FLAGS.data,
                batch_size=PER_CORE_BATCH_SIZE,
                use_bfloat16=_USE_BFLOAT16)
            imagenet_eval = imagenet_input.ImageNetInput(
                is_training=False,
                data_dir=FLAGS.data,
                batch_size=PER_CORE_BATCH_SIZE,
                use_bfloat16=_USE_BFLOAT16)
            train_dataset = strategy.experimental_distribute_datasets_from_function(
                imagenet_train.input_fn)
            test_dataset = strategy.experimental_distribute_datasets_from_function(
                imagenet_eval.input_fn)
        else:
            imagenet_train = imagenet_input.ImageNetInput(
                is_training=True,
                data_dir=FLAGS.data,
                batch_size=batch_size,
                use_bfloat16=_USE_BFLOAT16)
            imagenet_eval = imagenet_input.ImageNetInput(
                is_training=False,
                data_dir=FLAGS.data,
                batch_size=batch_size,
                use_bfloat16=_USE_BFLOAT16)
            train_dataset = strategy.experimental_distribute_dataset(
                imagenet_train.input_fn())
            test_dataset = strategy.experimental_distribute_dataset(
                imagenet_eval.input_fn())

        with strategy.scope():
            logging.info('Building Keras ResNet-50 model')
            model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
            optimizer = tf.keras.optimizers.SGD(
                learning_rate=ResnetLearningRateSchedule(
                    steps_per_epoch, _BASE_LEARNING_RATE),
                momentum=0.9,
                nesterov=True)
            training_loss = tf.keras.metrics.Mean('training_loss',
                                                  dtype=tf.float32)
            training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
                'training_accuracy', dtype=tf.float32)
            test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
            test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
                'test_accuracy', dtype=tf.float32)
            logging.info('Finished building Keras ResNet-50 model')

        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        latest_checkpoint = tf.train.latest_checkpoint(model_dir)
        initial_epoch = 0
        if latest_checkpoint:
            checkpoint.restore(latest_checkpoint)
            logging.info('Loaded checkpoint %s', latest_checkpoint)
            initial_epoch = optimizer.iterations.numpy() // steps_per_epoch

        # Create summary writers
        train_summary_writer = tf.summary.create_file_writer(
            os.path.join(model_dir, 'summaries/train'))
        test_summary_writer = tf.summary.create_file_writer(
            os.path.join(model_dir, 'summaries/test'))

        @tf.function
        def train_step(iterator):
            """Training StepFn."""
            def step_fn(inputs):
                """Per-Replica StepFn."""
                images, labels = inputs
                with tf.GradientTape() as tape:
                    logits = model(images, training=True)

                    # Loss calculations.
                    #
                    # Part 1: Prediction loss.
                    prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
                        labels, logits)
                    loss1 = tf.reduce_mean(prediction_loss)
                    # Part 2: Model weights regularization
                    loss2 = tf.reduce_sum(model.losses)

                    # Scale the loss, since TPUStrategy sums gradients across replicas.
                    loss = loss1 + loss2
                    loss = loss / strategy.num_replicas_in_sync

                grads = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))
                training_loss.update_state(loss)
                training_accuracy.update_state(labels, logits)

            strategy.experimental_run_v2(step_fn, args=(next(iterator), ))

        @tf.function
        def test_step(iterator):
            """Evaluation StepFn."""
            def step_fn(inputs):
                images, labels = inputs
                logits = model(images, training=False)
                loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, logits)
                loss = tf.reduce_mean(loss) / strategy.num_replicas_in_sync
                test_loss.update_state(loss)
                test_accuracy.update_state(labels, logits)

            strategy.experimental_run_v2(step_fn, args=(next(iterator), ))

        train_iterator = iter(train_dataset)
        for epoch in range(initial_epoch, FLAGS.num_epochs):
            logging.info('Starting to run epoch: %s', epoch)
            with train_summary_writer.as_default():
                for step in range(steps_per_epoch):
                    if step % 20 == 0:
                        logging.info('Running step %s in epoch %s', step,
                                     epoch)
                    train_step(train_iterator)
                tf.summary.scalar('loss',
                                  training_loss.result(),
                                  step=optimizer.iterations)
                tf.summary.scalar('accuracy',
                                  training_accuracy.result(),
                                  step=optimizer.iterations)
                logging.info('Training loss: %s, accuracy: %s%%',
                             round(float(training_loss.result()), 4),
                             round(float(training_accuracy.result()) * 100, 2))
                training_loss.reset_states()
                training_accuracy.reset_states()

            with test_summary_writer.as_default():
                test_iterator = iter(test_dataset)
                for step in range(steps_per_eval):
                    if step % 20 == 0:
                        logging.info(
                            'Starting to run eval step %s of epoch: %s', step,
                            epoch)
                    test_step(test_iterator)
                tf.summary.scalar('loss',
                                  test_loss.result(),
                                  step=optimizer.iterations)
                tf.summary.scalar('accuracy',
                                  test_accuracy.result(),
                                  step=optimizer.iterations)
                logging.info('Test loss: %s, accuracy: %s%%',
                             round(float(test_loss.result()), 4),
                             round(float(test_accuracy.result()) * 100, 2))
                test_loss.reset_states()
                test_accuracy.reset_states()

            checkpoint_name = checkpoint.save(
                os.path.join(model_dir, 'checkpoint'))
            logging.info('Saved checkpoint to %s', checkpoint_name)
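Examples #4 and #5 build the SGD optimizer around a ResnetLearningRateSchedule, a tf.keras.optimizers.schedules.LearningRateSchedule subclass defined elsewhere in the file. A minimal sketch, assuming linear warmup over the first five epochs followed by the usual step decay (the breakpoints are assumptions):

# Hypothetical reconstruction; the original file defines its own class.
class ResnetLearningRateSchedule(
        tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup followed by step decay, computed from the global step."""

    def __init__(self, steps_per_epoch, initial_learning_rate):
        super(ResnetLearningRateSchedule, self).__init__()
        self.steps_per_epoch = steps_per_epoch
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        lr_epoch = tf.cast(step, tf.float32) / self.steps_per_epoch
        # Warmup: ramp linearly from 0 over the first 5 epochs.
        learning_rate = self.initial_learning_rate * (lr_epoch / 5.0)
        for multiplier, start_epoch in [(1.0, 5), (0.1, 30), (0.01, 60),
                                        (0.001, 80)]:
            learning_rate = tf.where(lr_epoch >= start_epoch,
                                     self.initial_learning_rate * multiplier,
                                     learning_rate)
        return learning_rate

    def get_config(self):
        return {
            'steps_per_epoch': self.steps_per_epoch,
            'initial_learning_rate': self.initial_learning_rate,
        }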
Code example #5
File: resnet50_ctl_tf2.py, Project: zysxjtu/tpu
def main(unused_argv):
    tf.enable_v2_behavior()
    model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
    batch_size = PER_CORE_BATCH_SIZE * FLAGS.num_cores
    steps_per_epoch = FLAGS.steps_per_epoch or (
        APPROX_IMAGENET_TRAINING_IMAGES // batch_size)
    steps_per_eval = int(
        math.ceil(IMAGENET_VALIDATION_IMAGES / batch_size))
    logging.info('Saving checkpoints at %s', model_dir)
    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')

    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

    imagenet_train = imagenet_input.ImageNetInput(
        is_training=True,
        data_dir=FLAGS.data,
        batch_size=PER_CORE_BATCH_SIZE,
        use_bfloat16=_USE_BFLOAT16)
    imagenet_eval = imagenet_input.ImageNetInput(
        is_training=False,
        data_dir=FLAGS.data,
        batch_size=PER_CORE_BATCH_SIZE,
        use_bfloat16=_USE_BFLOAT16)
    train_dataset = strategy.experimental_distribute_datasets_from_function(
        imagenet_train.input_fn)
    test_dataset = strategy.experimental_distribute_datasets_from_function(
        imagenet_eval.input_fn)

    if _USE_BFLOAT16:
        policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
        tf.keras.mixed_precision.experimental.set_policy(policy)

    with strategy.scope():
        logging.info('Building Keras ResNet-50 model')
        model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
        base_lr = _BASE_LEARNING_RATE * batch_size / 256
        optimizer = tf.keras.optimizers.SGD(
            learning_rate=ResnetLearningRateSchedule(steps_per_epoch, base_lr),
            momentum=0.9,
            nesterov=True)
        training_loss = tf.keras.metrics.Mean('training_loss',
                                              dtype=tf.float32)
        training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'training_accuracy', dtype=tf.float32)
        test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'test_accuracy', dtype=tf.float32)
        logging.info('Finished building Keras ResNet-50 model')

        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        latest_checkpoint = tf.train.latest_checkpoint(model_dir)
        initial_epoch = 0
        if latest_checkpoint:
            # checkpoint.restore must be within a strategy.scope() so that optimizer
            # slot variables are mirrored.
            checkpoint.restore(latest_checkpoint)
            logging.info('Loaded checkpoint %s', latest_checkpoint)
            initial_epoch = optimizer.iterations.numpy() // steps_per_epoch

    # Create summary writers
    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(model_dir, 'summaries/train'))
    test_summary_writer = tf.summary.create_file_writer(
        os.path.join(model_dir, 'summaries/test'))

    @tf.function
    def train_step(iterator):
        """Training StepFn."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            images, labels = inputs
            with tf.GradientTape() as tape:
                predictions = model(images, training=True)
                if _USE_BFLOAT16:
                    predictions = tf.cast(predictions, tf.float32)

                # Loss calculations.
                #
                # Part 1: Prediction loss.
                prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, predictions)
                loss1 = tf.reduce_mean(prediction_loss)
                # Part 2: Model weights regularization
                loss2 = tf.reduce_sum(model.losses)

                # Scale the loss, since TPUStrategy sums gradients across replicas.
                loss = loss1 + loss2
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            training_loss.update_state(loss)
            training_accuracy.update_state(labels, predictions)

        strategy.experimental_run_v2(step_fn, args=(next(iterator), ))

    @tf.function
    def test_step(iterator):
        """Evaluation StepFn."""
        def step_fn(inputs):
            images, labels = inputs
            predictions = model(images, training=False)
            if _USE_BFLOAT16:
                predictions = tf.cast(predictions, tf.float32)
            loss = tf.keras.losses.sparse_categorical_crossentropy(
                labels, predictions)
            loss = safe_mean(loss)
            test_loss.update_state(loss)
            test_accuracy.update_state(labels, predictions)

        strategy.experimental_run_v2(step_fn, args=(next(iterator), ))

    train_iterator = iter(train_dataset)
    for epoch in range(initial_epoch, FLAGS.num_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        with train_summary_writer.as_default():
            for step in range(steps_per_epoch):
                if step % 20 == 0:
                    logging.info('Running step %s in epoch %s', step, epoch)
                train_step(train_iterator)
            tf.summary.scalar('loss',
                              training_loss.result(),
                              step=optimizer.iterations)
            tf.summary.scalar('accuracy',
                              training_accuracy.result(),
                              step=optimizer.iterations)
            logging.info('Training loss: %s, accuracy: %s%%',
                         round(float(training_loss.result()), 4),
                         round(float(training_accuracy.result()) * 100, 2))
            training_loss.reset_states()
            training_accuracy.reset_states()

        with test_summary_writer.as_default():
            test_iterator = iter(test_dataset)
            for step in range(steps_per_eval):
                if step % 20 == 0:
                    logging.info('Starting to run eval step %s of epoch: %s',
                                 step, epoch)
                test_step(test_iterator)
            tf.summary.scalar('loss',
                              test_loss.result(),
                              step=optimizer.iterations)
            tf.summary.scalar('accuracy',
                              test_accuracy.result(),
                              step=optimizer.iterations)
            logging.info('Test loss: %s, accuracy: %s%%',
                         round(float(test_loss.result()), 4),
                         round(float(test_accuracy.result()) * 100, 2))
            test_loss.reset_states()
            test_accuracy.reset_states()

        checkpoint_name = checkpoint.save(os.path.join(model_dir,
                                                       'checkpoint'))
        logging.info('Saved checkpoint to %s', checkpoint_name)
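Example #5's test_step averages the per-example loss with a safe_mean helper instead of tf.reduce_mean. A plausible sketch, assuming its job is to avoid 0/0 NaNs when a replica receives an empty final batch:

# Hypothetical reconstruction of the helper assumed by example #5.
def safe_mean(losses):
    total = tf.reduce_sum(losses)
    num_elements = tf.cast(tf.size(losses), dtype=losses.dtype)
    return tf.math.divide_no_nan(total, num_elements)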
Code example #6
  def test_keras_single_step(self):
    resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu='')
    tf.contrib.distribute.initialize_tpu_system(resolver)
    strategy = tf.contrib.distribute.TPUStrategy(resolver)
    np.random.seed(0)
    tf.set_random_seed(0)

    def input_fn():
      batch_size = 1024
      images = np.random.randn(batch_size, *IMAGE_SHAPE).astype(np.float32)
      labels = np.random.randint(
          0, NUM_CLASSES, size=batch_size).astype(np.float32)

      ds = tf.data.Dataset.from_tensor_slices((images, labels))
      ds = ds.map(lambda im, labels: (tf.cast(im, tf.bfloat16), labels))
      ds = ds.repeat()
      ds = ds.batch(batch_size, drop_remainder=True)
      return ds

    with strategy.scope():
      model = resnet_model.ResNet50(num_classes=NUM_CLASSES)

      model.compile(
          optimizer=gradient_descent.SGD(
              learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
          loss='sparse_categorical_crossentropy')

      # Reinitialize layers with known weights.
      # TODO(power) -- figure out a way to force deterministic initialization
      all_weights = []
      for w in model.get_weights():
        if len(w.shape) == 4:
          scale = np.sqrt(2.0 / (w.shape[0] * w.shape[1] * w.shape[-2]))
          all_weights.append((np.random.random_sample(w.shape) - 0.5) * scale)
        elif len(w.shape) == 2:
          scale = np.sqrt(2.0 / np.prod(w.shape))
          all_weights.append((np.random.random_sample(w.shape) - 0.5) * scale)
        else:
          all_weights.append(np.zeros(w.shape))
      model.set_weights(all_weights)

    lr_schedule_cb = LearningRateBatchScheduler(
        schedule=learning_rate_schedule_wrapper(1))
    training_callbacks = [
        lr_schedule_cb,
    ]

    model.fit(
        input_fn(),
        epochs=90,
        steps_per_epoch=1,
        callbacks=training_callbacks,
        verbose=0)

    weights = model.get_weights()
    golden_weights = [
        (-0.0091566, 0.944489),
        (0.0, 0.0),
        (0.0, 0.0),
        (-0.000772487, 1.4831e-05),
        (110.196, 611.292),
    ]
    try:
      for w, gw in zip(weights, golden_weights):
        assert np.allclose(w.mean(), gw[0])
        assert np.allclose(np.var(w), gw[1])
    except AssertionError:
      for w in weights:
        tf.logging.info('%s %s', w.mean(), np.var(w))
      raise
Code example #7
def main(unused_argv):
    assert FLAGS.data is not None, 'Provide training data path via --data.'
    tf.enable_v2_behavior()
    tf.compat.v1.disable_eager_execution()  # todo

    batch_size = FLAGS.num_cores * PER_CORE_BATCH_SIZE

    training_steps_per_epoch = FLAGS.steps_per_epoch or (
        APPROX_IMAGENET_TRAINING_IMAGES // batch_size)
    validation_steps = int(
        math.ceil(1.0 * IMAGENET_VALIDATION_IMAGES / batch_size))

    model_dir = FLAGS.model_dir if FLAGS.model_dir else DEFAULT_MODEL_DIR
    logging.info('Saving tensorboard summaries at %s', model_dir)

    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

    logging.info('Use bfloat16: %s.', USE_BFLOAT16)
    logging.info('Use global batch size: %s.', batch_size)
    logging.info('Enable top 5 accuracy: %s.', FLAGS.eval_top_5_accuracy)
    logging.info('Training model using data in directory "%s".', FLAGS.data)

    with strategy.scope():
        logging.info('Building Keras ResNet-50 model')
        model = resnet_model.ResNet50(num_classes=NUM_CLASSES)
        # model = keras_applications.mobilenet_v2.MobileNetV2(classes=NUM_CLASSES, weights=None)

        logging.info('Compiling model.')
        metrics = ['sparse_categorical_accuracy']

        if FLAGS.eval_top_5_accuracy:
            metrics.append(sparse_top_k_categorical_accuracy)

        model.compile(optimizer=tf.keras.optimizers.SGD(
            learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
                      loss='sparse_categorical_crossentropy',
                      metrics=metrics)

    imagenet_train = imagenet_input.ImageNetInput(is_training=True,
                                                  data_dir=FLAGS.data,
                                                  batch_size=batch_size,
                                                  use_bfloat16=USE_BFLOAT16)
    imagenet_eval = imagenet_input.ImageNetInput(is_training=False,
                                                 data_dir=FLAGS.data,
                                                 batch_size=batch_size,
                                                 use_bfloat16=USE_BFLOAT16)

    lr_schedule_cb = LearningRateBatchScheduler(
        schedule=learning_rate_schedule_wrapper(training_steps_per_epoch))
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=model_dir)

    training_callbacks = [lr_schedule_cb, tensorboard_cb]

    model.fit(imagenet_train.input_fn(),
              epochs=FLAGS.num_epochs,
              steps_per_epoch=training_steps_per_epoch,
              callbacks=training_callbacks,
              validation_data=imagenet_eval.input_fn(),
              validation_steps=validation_steps,
              validation_freq=5)

    model_saving_utils.save_model(model, model_dir, WEIGHTS_TXT)
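Examples #1 and #7 optionally append a sparse_top_k_categorical_accuracy metric when --eval_top_5_accuracy is set. A minimal sketch of such a metric, assuming top-5 accuracy over integer labels (the flattening of y_true is an assumption about the label shape):

# Hypothetical reconstruction of the metric assumed by examples #1 and #7.
def sparse_top_k_categorical_accuracy(y_true, y_pred):
    """Top-5 accuracy for sparse (integer) labels."""
    y_true = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
    return tf.keras.metrics.sparse_top_k_categorical_accuracy(
        y_true, y_pred, k=5)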
Code example #8
def main(argv):
    logging.info('Building Keras ResNet-50 model')
    model = resnet_model.ResNet50(num_classes=NUM_CLASSES)

    if FLAGS.use_tpu:
        logging.info('Converting from CPU to TPU model.')
        resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
        model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)
        session_master = resolver.master()
    else:
        session_master = ''

    logging.info('Compiling model.')
    model.compile(optimizer=tf.keras.optimizers.SGD(lr=BASE_LEARNING_RATE,
                                                    momentum=0.9,
                                                    nesterov=True),
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])

    callbacks = [LearningRateBatchScheduler(schedule=learning_rate_schedule)]
    if FLAGS.model_dir:
        callbacks.append(
            tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

    if FLAGS.data is None:
        training_images = np.random.randn(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE,
                                          3).astype(np.float32)
        training_labels = np.random.randint(NUM_CLASSES,
                                            size=BATCH_SIZE,
                                            dtype=np.int32)
        logging.info('Training model using synthetic data.')
        model.fit(training_images,
                  training_labels,
                  epochs=EPOCHS,
                  batch_size=BATCH_SIZE,
                  callbacks=callbacks)
        logging.info('Evaluating the model on synthetic data.')
        model.evaluate(training_images, training_labels, verbose=0)
    else:
        imagenet_train = imagenet_input.ImageNetInput(
            is_training=True,
            data_dir=FLAGS.data,
            per_core_batch_size=PER_CORE_BATCH_SIZE)
        logging.info('Training model using real data in directory "%s".',
                     FLAGS.data)
        model.fit(imagenet_train.input_fn,
                  epochs=EPOCHS,
                  steps_per_epoch=TRAINING_STEPS_PER_EPOCH,
                  callbacks=callbacks)

        logging.info('Evaluating the model on the validation dataset.')
        if FLAGS.eval_top_5_accuracy:
            logging.info('Evaluating top 1 and top 5 accuracy using a Python '
                         'generator.')
            # We feed the inputs from a Python generator, so we need to build a single
            # batch for all of the cores, which will be split on TPU.
            imagenet_eval = imagenet_input.ImageNetInput(
                is_training=False,
                data_dir=FLAGS.data,
                per_core_batch_size=BATCH_SIZE)
            score = eval_utils.multi_top_k_accuracy(
                model, imagenet_eval.evaluation_generator(K.get_session()),
                EVAL_STEPS)
        else:
            imagenet_eval = imagenet_input.ImageNetInput(
                is_training=False,
                data_dir=FLAGS.data,
                per_core_batch_size=PER_CORE_BATCH_SIZE)
            score = model.evaluate(imagenet_eval.input_fn,
                                   steps=EVAL_STEPS,
                                   verbose=1)
        print('Evaluation score', score)

        if HAS_H5PY:
            weights_file = os.path.join(
                FLAGS.model_dir if FLAGS.model_dir else '/tmp', WEIGHTS_TXT)
            logging.info('Save weights into %s', weights_file)
            model.save_weights(weights_file, overwrite=True)
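All of the examples assume absl flag definitions and an app.run entry point at module level. The sketch below collects the flags the snippets actually read; the flag names come from the code above, while the defaults and help strings are illustrative assumptions:

# Hypothetical flag definitions and entry point.
from absl import app
from absl import flags
from absl import logging  # noqa: F401 (used as logging.info above)

flags.DEFINE_string('tpu', None, 'Name or address of the TPU; None means a '
                    'local/colocated TPU.')
flags.DEFINE_string('data', None, 'Path to the ImageNet TFRecord data.')
flags.DEFINE_string('model_dir', None, 'Directory for checkpoints and '
                    'TensorBoard summaries.')
flags.DEFINE_integer('num_cores', 8, 'Number of TPU cores.')
flags.DEFINE_integer('num_epochs', 90, 'Number of epochs to train for.')
flags.DEFINE_integer('steps_per_epoch', None,
                     'Override for the computed steps per epoch.')
flags.DEFINE_bool('eval_top_5_accuracy', False, 'Also report top-5 accuracy.')
flags.DEFINE_bool('use_tpu', True, 'Run on TPU rather than CPU.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)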