Example #1
    def callableForTestUnmatchedModelFile(model, test_obj, train_ds, num_epoch,
                                          steps, strategy, saving_filepath,
                                          **kwargs):

        # The saving_filepath shouldn't exist at the beginning.
        test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

        model.fit(x=train_ds,
                  epochs=num_epoch,
                  steps_per_epoch=steps,
                  callbacks=[
                      callbacks.ModelCheckpoint(filepath=saving_filepath,
                                                save_weights_only=True)
                  ])

        (train_ds, _), (_, _) = testing_utils.get_test_data(train_samples=10,
                                                            test_samples=10,
                                                            input_shape=(3, ),
                                                            num_classes=2)

        # Switch to a model of different structure.
        with strategy.scope():
            model = keras.models.Sequential()
            model.add(keras.layers.Dense(5, input_dim=3, activation='relu'))
            model.add(keras.layers.Dense(2, activation='softmax'))
            model.compile(loss='categorical_crossentropy',
                          optimizer='rmsprop',
                          metrics=['acc'])

        test_obj.assertTrue(training_state.checkpoint_exists(saving_filepath))

        if saving_filepath.endswith('.tf'):
            test_obj.skipTest(
                'Loading mismatched TF checkpoint would cause Fatal '
                'Python error: Aborted. Skipping.')

        # Unmatched format. Should raise ValueError.
        with test_obj.assertRaisesRegexp(ValueError,
                                         'Error loading file from'):
            model.fit(x=train_ds,
                      epochs=num_epoch,
                      batch_size=8,
                      callbacks=[
                          callbacks.ModelCheckpoint(
                              filepath=saving_filepath,
                              save_weights_only=True,
                              load_weights_on_restart=True)
                      ])
Example #2
def create_callbacks(early_stopping, model_checkpoint, reduce_lr_on_plateau,
                     tensor_board):
    '''
    Build the list of callbacks.

    :param early_stopping: stop training if the 'monitor' metric does not improve for 'patience' epochs
    :param model_checkpoint: save the network weights with the best value of the 'monitor' metric
    :param reduce_lr_on_plateau: reduce the learning rate during training
    :param tensor_board: log training metrics for TensorBoard
    :return: list of the configured callbacks
    '''
    callbacks_list = []

    # if early_stopping == True:
    #     callbacks_list.append(callbacks.EarlyStopping(monitor='val_acc', patience=7))

    if model_checkpoint:
        callbacks_list.append(
            callbacks.ModelCheckpoint(
                filepath='weight_checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
                monitor='val_loss',
                save_best_only=True))

    if reduce_lr_on_plateau:
        callbacks_list.append(
            callbacks.ReduceLROnPlateau(monitor='val_loss',
                                        factor=0.1,
                                        patience=10))

    # if tensor_board == True:
    #     callbacks_list.append(callbacks.TensorBoard(log_dir='log_dir', histogram_freq=1))

    return callbacks_list
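A minimal usage sketch for `create_callbacks`, assuming the same Keras `callbacks` import as above; the toy data and model below are made up purely for illustration:

import os
import numpy as np
from tensorflow import keras

# Toy data and model purely for illustration.
x_train = np.random.rand(100, 8).astype('float32')
y_train = np.random.randint(0, 2, size=(100,))

model = keras.Sequential([
    keras.layers.Dense(16, activation='relu', input_shape=(8,)),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Enable checkpointing and LR reduction; the other two options stay off.
callback_list = create_callbacks(early_stopping=False,
                                 model_checkpoint=True,
                                 reduce_lr_on_plateau=True,
                                 tensor_board=False)

# The checkpoint pattern writes into 'weight_checkpoints/', so make sure the
# directory exists before training starts.
os.makedirs('weight_checkpoints', exist_ok=True)

model.fit(x_train, y_train,
          validation_split=0.2,
          epochs=5,
          callbacks=callback_list)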
Example #3
    def callableForTestModelRestoreCallback(model, test_obj, train_ds,
                                            num_epoch, steps, strategy,
                                            saving_filepath):

        saving_filepath, history_after_one_more_epoch = \
            KerasMultiWorkerCallbackTest.initialFitting(
                test_obj, model, train_ds, num_epoch, steps, saving_filepath)

        # The model should get restored to the weights previously saved, by
        # adding a ModelCheckpoint callback (which results in a
        # _ModelRestoreCallback being added), with load_weights_on_restart=True.
        history_after_model_restoring_and_one_more_epoch = model.fit(
            x=train_ds,
            epochs=1,
            steps_per_epoch=steps,
            callbacks=[
                callbacks.ModelCheckpoint(filepath=saving_filepath,
                                          save_weights_only=True,
                                          load_weights_on_restart=True)
            ])

        # Asserting that the history one epoch after the initial fitting and one
        # epoch after restoring are close.
        test_obj.assertAllClose(
            history_after_one_more_epoch.history,
            history_after_model_restoring_and_one_more_epoch.history)

        history_one_more_epoch_without_model_restoring = model.fit(
            x=train_ds, epochs=1, steps_per_epoch=steps)

        # Ensuring training for another epoch gives different result.
        test_obj.assertNotAllClose(
            history_after_model_restoring_and_one_more_epoch.history,
            history_one_more_epoch_without_model_restoring.history)
Example #4
    def fit(self,
            trdst,
            valdst,
            nb_epochs,
            steps_per_epoch,
            batch_size=100,
            use_wn=False):

        opt = AdamWithWeightnorm() if use_wn else optimizers.Adam()
        self.model.compile(optimizer=opt, loss='mse', metrics=[psnr_tf])

        log_dir = os.path.join(self.log_dir, self.model_name)
        callback_list = [
            callbacks.ModelCheckpoint(self.weights_path,
                                      save_best_only=False,
                                      save_weights_only=True,
                                      verbose=1),
            callbacks.LearningRateScheduler(
                lambda e: self.lr_schedule(e, nb_epochs), verbose=0),
            callbacks.TensorBoard(log_dir=log_dir,
                                  histogram_freq=1,
                                  write_graph=True)
        ]

        print('Training model : %s' % (self.model_name))

        self.model.fit(
            x=trdst.batch(batch_size).prefetch(AUTOTUNE),
            epochs=nb_epochs,
            callbacks=callback_list,
            validation_data=valdst.batch(batch_size).prefetch(AUTOTUNE),
            steps_per_epoch=steps_per_epoch,
            verbose=1)

        return self
Example #5
def train_model(model_name, weights_save_path):
    with h5py.File(hdf5_path + mode_list[0] + "_2D_data_" + str(index) + ".h5", "r") as f:
        train_img = f["batch_patches"].value[..., 0:6]  # (num,patch_h,patch_w,6)
        train_gt = f["batch_patches"].value[..., 6]  # (num,patch_h,patch_w)
    with h5py.File(hdf5_path + mode_list[1] + "_2D_data_" + str(index) + ".h5", "r") as f:
        val_img = f["batch_patches"].value[..., 0:6]  # (num,patch_h,patch_w,6)
        val_gt = f["batch_patches"].value[..., 6]  # (num,patch_h,patch_w)
    print(train_img.shape)
    if model_name == "Unet":
        model = unet.unet(input_height=patch_h, input_width=patch_w)
    elif model_name == "Nonlocal dsv Unet":
        model = unet_nonlocal.unet_nonlocal(input_height=patch_h, input_width=patch_w)
    else:
        model = unet_nonlocal.res_unet_nonlocal(input_height=patch_h, input_width=patch_w)
    cp = [callbacks.EarlyStopping(monitor='val_dice',
                                  patience=10,
                                  mode='max'),
          callbacks.ModelCheckpoint(filepath=weights_save_path,
                                    monitor='val_dice',
                                    save_best_only=True,
                                    save_weights_only=True,
                                    mode='max',
                                    verbose=1)]
    print("Training " + model_name + " Model")
    if load_weights:
        print("Loading " + model_name + " Model Weights")
        model.load_weights(weights_save_path)
    history = model.fit(train_img, train_gt[..., None], batch_size=batch_size, epochs=epochs,
                            validation_data=(val_img, val_gt[..., None]),
                            shuffle=True, callbacks=cp)
    eval_metrics = model.evaluate(val_img, val_gt[..., None])
    visualize_loss(history)
    return eval_metrics
Example #6
    def testCheckpointExists(self, file_format, save_weights_only):
        train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(64, 2)
        model = multi_worker_testing_utils.get_mnist_model((28, 28, 1))
        saving_dir = self.get_temp_dir()
        saving_filepath = os.path.join(saving_dir, 'checkpoint.' + file_format)
        callbacks_list = [
            callbacks.ModelCheckpoint(filepath=saving_filepath,
                                      save_weights_only=save_weights_only)
        ]
        self.assertFalse(file_io.file_exists_v2(saving_filepath))

        try:
            model.fit(x=train_ds,
                      epochs=2,
                      steps_per_epoch=2,
                      callbacks=callbacks_list)
        except NotFoundError as e:
            if 'Failed to create a NewWriteableFile' in e.message:
                self.skipTest(
                    'b/138941852, path not found error in Windows py35.')
        tf_saved_model_exists = file_io.file_exists_v2(saving_filepath)
        tf_weights_only_checkpoint_exists = file_io.file_exists_v2(
            saving_filepath + '.index')
        self.assertTrue(tf_saved_model_exists
                        or tf_weights_only_checkpoint_exists)
Example #7
  def test_validate_callbacks_predefined_callbacks(self):
    supported_predefined_callbacks = [
        callbacks.TensorBoard(),
        callbacks.CSVLogger(filename='./log.csv'),
        callbacks.EarlyStopping(),
        callbacks.ModelCheckpoint(filepath='./checkpoint'),
        callbacks.TerminateOnNaN(),
        callbacks.ProgbarLogger(),
        callbacks.History(),
        callbacks.RemoteMonitor()
    ]

    distributed_training_utils.validate_callbacks(
        supported_predefined_callbacks, adam.Adam())

    unsupported_predefined_callbacks = [
        callbacks.ReduceLROnPlateau(),
        callbacks.LearningRateScheduler(schedule=lambda epoch: 0.001)
    ]

    for callback in unsupported_predefined_callbacks:
      with self.assertRaisesRegex(ValueError,
                                  'You must specify a Keras Optimizer V2'):
        distributed_training_utils.validate_callbacks([callback],
                                                      v1_adam.AdamOptimizer())
Example #8
    def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
            model, test_obj, train_ds, num_epoch, steps, strategy,
            saving_filepath, **kwargs):
        # Incorporate type/index information and thread id in saving_filepath to
        # ensure every worker has a unique path. Note that in a normal use case
        # the saving_filepath would be the same for all workers, but we use
        # different ones here just to test that the chief saves the checkpoint
        # while non-chief workers don't.

        # TODO(b/134551335): Must save to hdf5 until bug with copying
        # MirroredVariables is resolved.
        saving_filepath = os.path.join(
            test_obj.get_temp_dir(), 'checkpoint_%s_%d.h5' %
            (test_base.get_task_type(), test_base.get_task_index()))

        # The saving_filepath shouldn't exist at the beginning (as it's unique).
        test_obj.assertFalse(os.path.exists(saving_filepath))

        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

        # If it's chief, the model should be saved; if not, the model shouldn't.
        test_obj.assertEqual(os.path.exists(saving_filepath),
                             test_base.is_chief())
Example #9
    def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
            model, test_obj, train_ds, num_epoch, steps, strategy,
            saving_filepath, **kwargs):

        extension = os.path.splitext(saving_filepath)[1]

        # Incorporate type/index information and thread id in saving_filepath to
        # ensure every worker has a unique path. Note that in a normal use case
        # the saving_filepath would be the same for all workers, but we use
        # different ones here just to test that the chief saves the checkpoint
        # while non-chief workers don't.

        saving_filepath = os.path.join(
            test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
            (test_base.get_task_type(), test_base.get_task_index(), extension))

        # The saving_filepath shouldn't exist at the beginning (as it's unique).
        test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

        # If it's chief, the model should be saved; if not, the model shouldn't.
        test_obj.assertEqual(training_state.checkpoint_exists(saving_filepath),
                             test_base.is_chief())
Example #10
    def initialFitting(test_obj, model, train_ds, num_epoch, steps,
                       saving_filepath):
        # The saving_filepath shouldn't exist at the beginning.
        test_obj.assertFalse(os.path.exists(saving_filepath))

        model.fit(x=train_ds,
                  epochs=num_epoch,
                  steps_per_epoch=steps,
                  callbacks=[
                      callbacks.ModelCheckpoint(filepath=saving_filepath,
                                                save_weights_only=True)
                  ])

        # The saving_filepath should exist after fitting with the callback. Both
        # chief and non-chief workers should see that it exists (even though it
        # was saved only by the chief).
        test_obj.assertTrue(os.path.exists(saving_filepath))

        history_after_one_more_epoch = model.fit(x=train_ds,
                                                 epochs=1,
                                                 steps_per_epoch=steps)

        # The saving_filepath should continue to exist (if it did) after fitting
        # without callback.
        test_obj.assertTrue(os.path.exists(saving_filepath))

        return saving_filepath, history_after_one_more_epoch
Example #11
def _create_best_checkpoint(
        artifact_dir: str, cfg_services: dict,
        metrics_names: List[str]) -> callbacks.ModelCheckpoint:
    """Create a callback that saves the best model.

    Args:
        artifact_dir: str, path to artifact directory.
        cfg_services: dict, services subsection of config.
        metrics_names: list[str], 'metrics' names.

    Returns:
        ModelCheckpoint, callback that saves the best model.
    """
    checkpoint_params = cfg_services["best_checkpoint"]
    checkpoint_params["monitor"] = _force_monitor_to_mode(
        checkpoint_params["monitor"], metrics_names, True, "best_checkpoint")
    filepath = get_best_checkpoint_filepath(artifact_dir)
    return callbacks.ModelCheckpoint(
        filepath=filepath,
        monitor=checkpoint_params["monitor"],
        mode=checkpoint_params["mode"],
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
    )
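A hedged usage sketch for `_create_best_checkpoint`; the `cfg_services` layout below is an assumption inferred from the dictionary lookups in the function, and the directory and metric names are placeholders:

cfg_services = {
    "best_checkpoint": {
        # _force_monitor_to_mode may rewrite this name to match the chosen mode.
        "monitor": "val_categorical_accuracy",
        "mode": "max",
    }
}

best_ckpt = _create_best_checkpoint(
    artifact_dir="artifacts/run_01",
    cfg_services=cfg_services,
    metrics_names=["categorical_accuracy"],
)
# The returned ModelCheckpoint can be passed straight to model.fit, e.g.
# model.fit(train_ds, validation_data=val_ds, callbacks=[best_ckpt]).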
Example #12
def checkpoint_callback(args):
    try:
        os.makedirs(args.checkpoint_dir)
    except OSError:
        pass
    timestamp = int(time.time())
    pattern = "weights-%i-{epoch:05d}.h5" % timestamp
    filepath = os.path.join(args.checkpoint_dir, pattern)
    return callbacks.ModelCheckpoint(filepath)
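A short usage sketch for `checkpoint_callback`, assuming `args` only needs a `checkpoint_dir` attribute (faked here with `argparse.Namespace`); the training call is left as a comment because `model`, `x_train`, and `y_train` are placeholders:

import argparse

args = argparse.Namespace(checkpoint_dir='checkpoints')
ckpt_cb = checkpoint_callback(args)

# One weights file is written per epoch, named like
# 'checkpoints/weights-<timestamp>-00001.h5'.
# model.fit(x_train, y_train, epochs=10, callbacks=[ckpt_cb])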
Example #13
    def callableForTestUnmatchedModelFile(model, test_obj, train_ds, num_epoch,
                                          steps, strategy, saving_filepath):

        # The saving_filepath shouldn't exist at the beginning.
        test_obj.assertFalse(os.path.exists(saving_filepath))

        model.fit(x=train_ds,
                  epochs=num_epoch,
                  steps_per_epoch=steps,
                  callbacks=[
                      callbacks.ModelCheckpoint(filepath=saving_filepath,
                                                save_weights_only=True)
                  ])

        (train_ds, _), (_, _) = testing_utils.get_test_data(train_samples=10,
                                                            test_samples=10,
                                                            input_shape=(3, ),
                                                            num_classes=2)

        # Switch to a model of different structure.
        with strategy.scope():
            model = keras.models.Sequential()
            model.add(keras.layers.Dense(5, input_dim=3, activation='relu'))
            model.add(keras.layers.Dense(2, activation='softmax'))
            model.compile(loss='categorical_crossentropy',
                          optimizer='rmsprop',
                          metrics=['acc'])

        # TODO(b/129779608): Fix the flakiness of the following check.
        # test_obj.assertTrue(os.path.exists(saving_filepath))

        # Unmatched format. Should raise ValueError.
        with test_obj.assertRaisesRegexp(ValueError,
                                         'Error loading file from'):
            model.fit(x=train_ds,
                      epochs=num_epoch,
                      batch_size=8,
                      callbacks=[
                          callbacks.ModelCheckpoint(
                              filepath=saving_filepath,
                              save_weights_only=True,
                              load_weights_on_restart=True)
                      ])
Example #14
        def proc_model_checkpoint_works_with_same_file_path(
                test_obj, saving_filepath):
            if multi_process_runner.is_oss():
                test_obj.skipTest('TODO(b/170838633): Failing in OSS')
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 4

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(file_io.file_exists_v2(saving_filepath))
            bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup')

            try:
                model.fit(
                    x=train_ds,
                    epochs=num_epoch,
                    steps_per_epoch=steps,
                    callbacks=[
                        callbacks.ModelCheckpoint(filepath=saving_filepath),
                        callbacks.BackupAndRestore(backup_dir=bar_dir),
                        InterruptingCallback()
                    ])
            except RuntimeError as e:
                if 'Interrupting!' not in str(e):
                    raise

            multi_process_runner.get_barrier().wait()
            backup_filepath = os.path.join(bar_dir, 'chief', 'checkpoint')
            test_obj.assertTrue(file_io.file_exists_v2(backup_filepath))
            test_obj.assertTrue(file_io.file_exists_v2(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(filepath=saving_filepath),
                          callbacks.BackupAndRestore(backup_dir=bar_dir),
                          AssertCallback()
                      ])
            multi_process_runner.get_barrier().wait()
            test_obj.assertFalse(file_io.file_exists_v2(backup_filepath))
            test_obj.assertTrue(file_io.file_exists_v2(saving_filepath))
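The snippet above (and Example #18 below) relies on `InterruptingCallback` and `AssertCallback` helpers that are not defined in it. A minimal sketch of the interrupting helper, modeled on the callback defined inline in Example #26, might look as follows; the assertion helper is omitted because its body is not shown in any example:

from tensorflow.keras import callbacks  # assumed import; the snippets above do not show theirs


class InterruptingCallback(callbacks.Callback):
    """Raises partway through training to simulate a worker being preempted."""

    def on_epoch_begin(self, epoch, logs=None):
        if epoch == 2:
            raise RuntimeError('Interrupting!')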
Example #15
def define_callbacks(output, batch_size):
    csv_logger = callbacks.CSVLogger(join(output, 'training.log'))
    earlystop = callbacks.EarlyStopping(monitor='val_loss', patience=2)
    tensorboard = callbacks.TensorBoard(batch_size=batch_size)
    fpath = join(
        output,
        'weights.{epoch:02d}-{loss:.2f}-{acc:.2f}-{val_loss:.2f}-{val_acc:.2f}.hdf5'
    )
    cp_cb = callbacks.ModelCheckpoint(filepath=fpath,
                                      monitor='val_loss',
                                      save_best_only=True)
    return [csv_logger, earlystop, tensorboard, cp_cb]
Example #16
def train():
    """Runs the training."""
    print('Starting training process...', flush=True)
    # Basic training settings.
    model = SimpleLightcurveCnn()
    database = ToiDatabase()
    # database.batch_size = 100  # Reducing the batch size may help if you are running out of memory.
    epochs_to_run = 1000
    trial_name = 'baseline'
    logs_directory = 'logs'

    # Setup logging.
    datetime_string = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    trial_directory = os.path.join(logs_directory,
                                   f'{trial_name} {datetime_string}')
    tensorboard_callback = callbacks.TensorBoard(log_dir=trial_directory)
    database.trial_directory = trial_directory
    model_save_path = os.path.join(trial_directory, 'model.ckpt')
    model_checkpoint_callback = callbacks.ModelCheckpoint(
        model_save_path, save_weights_only=True)

    # Prepare training data and metrics.
    training_dataset, validation_dataset = database.generate_datasets()
    optimizer = tf.optimizers.Adam(learning_rate=1e-4,
                                   beta_1=0.99,
                                   beta_2=0.9999)
    loss_metric = BinaryCrossentropy(name='Loss')
    metrics = [
        tf.metrics.BinaryAccuracy(name='Accuracy'),
        tf.metrics.Precision(name='Precision'),
        tf.metrics.Recall(name='Recall'),
        tf.metrics.SpecificityAtSensitivity(
            0.9, name='Specificity_at_90_percent_sensitivity'),
        tf.metrics.SensitivityAtSpecificity(
            0.9, name='Sensitivity_at_90_percent_specificity')
    ]

    # Compile and train model.
    model.compile(optimizer=optimizer, loss=loss_metric, metrics=metrics)
    try:
        model.fit(training_dataset,
                  epochs=epochs_to_run,
                  validation_data=validation_dataset,
                  callbacks=[tensorboard_callback, model_checkpoint_callback],
                  steps_per_epoch=5000,
                  validation_steps=500)
    except KeyboardInterrupt:
        print('Interrupted. Saving model before quitting...', flush=True)
    finally:
        model.save_weights(model_save_path)
    print('Training done.', flush=True)
Example #17
def creat_net(train_generator,validation_generator,batch_size,image_lengh,image_width):
    model = Sequential([
        Conv2D(filters=32, kernel_size=3, padding='same', activation='relu', input_shape=(image_lengh,image_width, 3)),
        MaxPooling2D(pool_size=2),
        Conv2D(filters=64, kernel_size=3, padding='same', activation='relu'),
        MaxPooling2D(pool_size=2),
        Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'),
        MaxPooling2D(pool_size=2),
        Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'),
        MaxPooling2D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(4, activation='softmax')
    ])
    # Reduce=ReduceLROnPlateau(monitor='val_accuracy',
    #                          factor=0.1,
    #                          patience=2,
    #                          verbose=1,
    #                          mode='auto',
    #                          epsilon=0.0001,
    #                          cooldown=0,
    #                          min_lr=0)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Save the best model
    filepath = './模型/cnn_weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5'
    checkpoint = callbacks.ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

    model.fit_generator(train_generator, epochs=20, steps_per_epoch=1707//batch_size,validation_data=validation_generator,
                    validation_steps=264//batch_size, callbacks=[checkpoint])#,Reduce])
    # Plot the loss and accuracy curves
    loss = model.history.history['loss']
    val_loss = model.history.history['val_loss']
    epoches = range(1, len(loss) + 1)
    acc = model.history.history['accuracy']
    val_acc = model.history.history['val_accuracy']
    plt.subplot(121)
    plt.plot(epoches, loss, 'bo', label='training_loss')
    plt.plot(epoches, val_loss, 'r', label='validation_loss')
    plt.xlabel('epoches')
    plt.ylabel('loss')
    plt.title('losses of train and val')
    plt.legend()
    plt.subplot(122)
    plt.plot(epoches, acc, 'bo', label='training_acc')
    plt.plot(epoches, val_acc, 'r', label='validation_acc')
    plt.xlabel('epoches')
    plt.ylabel('acc')
    plt.title('accuracy of train and val')
    plt.legend()
    plt.show()
Example #18
        def proc_model_checkpoint_works_with_same_file_path(
                test_obj, saving_filepath):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 4

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(file_io.file_exists(saving_filepath))
            bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup')

            try:
                model.fit(
                    x=train_ds,
                    epochs=num_epoch,
                    steps_per_epoch=steps,
                    callbacks=[
                        callbacks.ModelCheckpoint(filepath=saving_filepath),
                        callbacks.BackupAndRestore(backup_dir=bar_dir),
                        InterruptingCallback()
                    ])
            except RuntimeError as e:
                if 'Interrupting!' not in str(e):
                    raise

            backup_filepath = os.path.join(bar_dir, 'checkpoint')
            test_obj.assertTrue(file_io.file_exists(backup_filepath))
            test_obj.assertTrue(file_io.file_exists(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(filepath=saving_filepath),
                          callbacks.BackupAndRestore(backup_dir=bar_dir),
                          AssertCallback()
                      ])
            test_obj.assertFalse(file_io.file_exists(backup_filepath))
            test_obj.assertTrue(file_io.file_exists(saving_filepath))
Example #19
    def proc_model_checkpoint_works_with_same_file_path(
        test_obj, saving_filepath):
      model, _, train_ds, steps = _model_setup(test_obj, file_format='')
      num_epoch = 2

      # The saving_filepath shouldn't exist at the beginning (as it's unique).
      test_obj.assertFalse(file_io.file_exists(saving_filepath))

      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

      test_obj.assertTrue(file_io.file_exists(saving_filepath))
Example #20
  def setUp(self):
    super(CallbackFallbackTest, self).setUp()
    self.batch_size = 5
    self.numpy_input = np.zeros((50, 10))
    self.numpy_target = np.ones(50)
    self.tensor_input = constant_op.constant(2.0, shape=(50, 10))
    self.tensor_target = array_ops.ones((50,))
    self.dataset_input = dataset_ops.DatasetV2.from_tensor_slices(
        (self.numpy_input, self.numpy_target)).shuffle(50).batch(
            self.batch_size)

    def generator():
      yield (np.zeros((self.batch_size, 10)), np.ones(self.batch_size))
    self.generator_input = generator()
    self.sequence_input = TestSequence(batch_size=self.batch_size,
                                       feature_shape=10)

    self.fallback_ckeckpoint_cb = cbks.ModelCheckpoint(
        self.get_temp_dir(), save_freq=10)
    self.normal_checkpoint_cb = cbks.ModelCheckpoint(
        self.get_temp_dir(), save_freq='epoch')
    self.fallback_tensorboard_cb = cbks.TensorBoard(update_freq=10)
    self.normal_tensorboard_cb = cbks.TensorBoard(update_freq='batch')
    self.unaffected_cb = cbks.CSVLogger(self.get_temp_dir())
Example #21
  def testCheckpointExists(self, file_format, save_weights_only):
    train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(64, 2)
    model = multi_worker_testing_utils.get_mnist_model((28, 28, 1))
    saving_dir = self.get_temp_dir()
    saving_filepath = os.path.join(saving_dir, 'checkpoint.' + file_format)
    callbacks_list = [
        callbacks.ModelCheckpoint(
            filepath=saving_filepath, save_weights_only=save_weights_only)
    ]
    self.assertFalse(training_state.checkpoint_exists(saving_filepath))

    model.fit(x=train_ds, epochs=2, steps_per_epoch=2, callbacks=callbacks_list)
    self.assertTrue(training_state.checkpoint_exists(saving_filepath))
    self.assertTrue(
        training_state.remove_checkpoint_if_exists(saving_dir, saving_filepath))
    self.assertFalse(training_state.checkpoint_exists(saving_filepath))
Example #22
        def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
                test_obj, file_format):

            model, saving_filepath, train_ds, steps = _model_setup(
                test_obj, file_format)
            num_epoch = 2
            extension = os.path.splitext(saving_filepath)[1]

            # Incorporate type/index information and thread id in saving_filepath to
            # ensure every worker has a unique path. Note that in a normal use case
            # the saving_filepath would be the same for all workers, but we use
            # different ones here just to test that the chief saves the checkpoint
            # while non-chief workers don't.
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(),
                'checkpoint_%s_%d%s' % (test_base.get_task_type(),
                                        test_base.get_task_index(), extension))

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(
                training_state.checkpoint_exists(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      validation_data=train_ds,
                      validation_steps=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(
                              filepath=saving_filepath,
                              save_weights_only=save_weights_only)
                      ])

            # If it's chief, the model should be saved; if not, the model shouldn't.
            test_obj.assertEqual(
                training_state.checkpoint_exists(saving_filepath),
                test_base.is_chief())

            # If it's chief, the model should be saved (`write_filepath` should
            # simply return `saving_filepath`); if not, i.e. for non-chief workers,
            # the temporary path generated by `write_filepath` should no longer
            # contain the checkpoint that has been deleted.
            test_obj.assertEqual(
                training_state.checkpoint_exists(
                    distributed_file_utils.write_filepath(
                        saving_filepath, model._distribution_strategy)),
                test_base.is_chief())
Example #23
def _create_resume_checkpoint(artifact_dir: str) -> callbacks.ModelCheckpoint:
    """Create a callback that saves the model every epoch.

    Args:
        artifact_dir: str, path to artifact directory.

    Returns:
        ModelCheckpoint, callback that saves the model every epoch.
    """
    filepath = get_resume_checkpoints_filepath(artifact_dir)
    return callbacks.ModelCheckpoint(
        filepath=filepath,
        monitor="val_loss",
        mode="min",
        verbose=0,
        save_best_only=False,
        save_weights_only=True,
    )
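A hedged sketch of how this factory and `_create_best_checkpoint` from Example #11 might be combined; `artifact_dir`, `cfg_services`, `metrics_names`, and the datasets are placeholders, so the training call is left as a comment:

checkpoint_callbacks = [
    _create_best_checkpoint(artifact_dir, cfg_services, metrics_names),
    _create_resume_checkpoint(artifact_dir),
]

# Best-only weights and per-epoch resume weights are written to the separate
# paths returned by get_best_checkpoint_filepath and
# get_resume_checkpoints_filepath.
# model.fit(train_ds, validation_data=val_ds, epochs=20,
#           callbacks=checkpoint_callbacks)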
Example #24
 def testCheckpointExists(self, file_format, save_weights_only):
   with self.cached_session():
     train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(64, 2)
     model = multi_worker_testing_utils.get_mnist_model((28, 28, 1))
     saving_dir = self.get_temp_dir()
     saving_filepath = os.path.join(saving_dir, 'checkpoint.' + file_format)
     callbacks_list = [
         callbacks.ModelCheckpoint(
             filepath=saving_filepath, save_weights_only=save_weights_only)
     ]
     self.assertFalse(file_io.file_exists_v2(saving_filepath))
     model.fit(
         x=train_ds, epochs=2, steps_per_epoch=2, callbacks=callbacks_list)
     tf_saved_model_exists = file_io.file_exists_v2(saving_filepath)
     tf_weights_only_checkpoint_exists = file_io.file_exists_v2(
         saving_filepath + '.index')
     self.assertTrue(
         tf_saved_model_exists or tf_weights_only_checkpoint_exists)
Example #25
def callb(path_checkpoint):
    callback_checkpoint = tf_cb.ModelCheckpoint(
        filepath=path_checkpoint, monitor = 'loss', verbose=1,
        save_weights_only=True, save_best_only=True)

    callback_earlystopping = tf_cb.EarlyStopping(monitor='loss',
                                                 patience=20, verbose=1)
    callback_reduce_lr = tf_cb.ReduceLROnPlateau(monitor='loss',
                                                 factor=0.98,
                                                 min_lr=0.3e-4,
                                                 patience=0,
                                                 verbose=1)
    callBacks = [
        callback_checkpoint,
        callback_earlystopping,
        callback_reduce_lr
    ]
    return callBacks
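A short usage sketch for `callb`; the checkpoint path is arbitrary and `model`, `x_train`, and `y_train` are placeholders, so the training call is left as a comment:

callback_list = callb('checkpoints/best_weights.h5')

# All three callbacks monitor the training loss, so they also work when no
# validation data is passed to fit().
# model.fit(x_train, y_train, epochs=200, callbacks=callback_list)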
Example #26
  def callableForTestBackupModelNotRemovedIfInterrupted(model, test_obj,
                                                        train_ds, num_epoch,
                                                        steps, strategy,
                                                        saving_filepath,
                                                        **kwargs):

    # `barrier` object needs to be passed in from parent
    # thread so both threads refer to the same object.
    barrier = kwargs['barrier']

    num_epoch = 4

    # Testing the backup filepath `multi_worker_training_state` uses.
    _, backup_filepath = training_state._get_backup_filepath(saving_filepath)

    # The backup_filepath shouldn't exist at the beginning.
    test_obj.assertFalse(training_state.checkpoint_exists(backup_filepath))

    # Callback to interrupt in the middle of training.
    class InterruptingCallback(callbacks.Callback):

      def on_epoch_begin(self, epoch, logs=None):
        if epoch == 2:
          raise RuntimeError('Interrupting!')

    try:
      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=[
              callbacks.ModelCheckpoint(
                  filepath=saving_filepath, save_weights_only=True),
              InterruptingCallback()
          ])
    except RuntimeError as e:
      if 'Interrupting!' not in str(e):
        raise

    # Sync on the two threads.
    barrier.wait()

    # The backup file should exist after `model.fit()` is interrupted.
    test_obj.assertTrue(training_state.checkpoint_exists(backup_filepath))
Example #27
    def callableForTestBackupModelRemoved(model, test_obj, train_ds, num_epoch,
                                          steps, strategy, saving_filepath,
                                          **kwargs):

        # `barrier` object needs to be passed in from parent
        # thread so both threads refer to the same object.
        barrier = kwargs['barrier']

        num_epoch = 3

        # Testing the backup filepath `multi_worker_training_state` uses.
        _, backup_filepath = training_state._get_backup_filepath(
            saving_filepath)

        # The backup_filepath shouldn't exist at the beginning.
        test_obj.assertFalse(training_state.checkpoint_exists(backup_filepath))

        # Callback to verify that the backup file exists in the middle of training.
        class BackupFilepathVerifyingCallback(callbacks.Callback):
            def on_epoch_begin(self, epoch, logs=None):
                if epoch > 1:
                    # Asserting that after the first two epochs, the backup file should
                    # exist.
                    test_obj.assertTrue(
                        training_state.checkpoint_exists(backup_filepath))

        model.fit(x=train_ds,
                  epochs=num_epoch,
                  steps_per_epoch=steps,
                  callbacks=[
                      callbacks.ModelCheckpoint(filepath=saving_filepath,
                                                save_weights_only=True),
                      BackupFilepathVerifyingCallback()
                  ])

        # Sync on the two threads so we make sure the backup file is removed before
        # we move on.
        barrier.wait()

        # The backup file should not exist after `model.fit()` exits successfully.
        test_obj.assertFalse(training_state.checkpoint_exists(backup_filepath))
Example #28
    def callableForTestIntermediateDirForFTAreRemoved(model, test_obj,
                                                      train_ds, num_epoch,
                                                      steps, strategy,
                                                      saving_filepath,
                                                      **kwargs):
        """Testing that the temporary directory are removed.

    Some temporary directories are created for the purpose of fault tolerance.
    This test ensures that such directories should have been removed at the time
    `model.fit()` finishes successfully.
    """

        # `threading_local` and `barrier` objects have to be passed in from parent
        # thread so both threads refer to the same object.
        threading_local = kwargs['threading_local']
        barrier = kwargs['barrier']

        # Each of the two threads has its own copy of the
        # `temp_dirs_supposed_to_be_removed` list.
        threading_local.temp_dirs_supposed_to_be_removed = []

        callbacks_list = [
            callbacks.ModelCheckpoint(filepath=saving_filepath,
                                      save_weights_only=True,
                                      load_weights_on_restart=True),
        ]

        # Keep the references to the real function objects.
        real_os_path_join = os.path.join
        real_tempfile_mkdtemp = tempfile.mkdtemp

        # Make a `os.path.join` wrapper, which will be patched onto the real
        # function, so the temporary directories can be tracked.
        def wrapper_os_path_join(path, *paths):
            join_result = real_os_path_join(path, *paths)
            if len(paths) == 1 and paths[0] == 'backup':
                threading_local.temp_dirs_supposed_to_be_removed.append(
                    join_result)
            return join_result

        # Likewise for `tempfile.mkdtemp`.
        def wrapper_tempfile_mkdtemp():
            result = real_tempfile_mkdtemp()
            threading_local.temp_dirs_supposed_to_be_removed.append(result)
            return result

        # Now the two threads must sync here: if they are out of sync, one thread
        # can go ahead and patch `os.path.join` while the other has not even
        # assigned the real `os.path.join` to `real_os_path_join`. If that
        # happened, the "real" `os.path.join` the slower thread saw would
        # actually be the other thread's wrapper.
        barrier.wait()

        # Note that `os.path.join` will respect the second patch (there are two
        # patches because of the two threads). Both threads will refer to the same
        # copy of `wrapper_os_path_join` because of the `barrier` preceding
        # `model.fit()`. Likewise for `wrapper_tempfile_mkdtemp`.
        os.path.join = wrapper_os_path_join
        tempfile.mkdtemp = wrapper_tempfile_mkdtemp

        barrier.wait()
        model.fit(x=train_ds,
                  epochs=num_epoch,
                  steps_per_epoch=steps,
                  callbacks=callbacks_list)

        # Sync before un-patching to prevent either thread from accessing the real
        # functions. Also to make sure `model.fit()` is done on both threads (so we
        # can safely assert the directories are removed).
        barrier.wait()
        os.path.join = real_os_path_join
        tempfile.mkdtemp = real_tempfile_mkdtemp

        # There should be directory names recorded that are supposed to be removed.
        test_obj.assertTrue(threading_local.temp_dirs_supposed_to_be_removed)
        for temp_dir_supposed_to_be_removed in (
                threading_local.temp_dirs_supposed_to_be_removed):
            # They should have been removed and thus don't exist.
            test_obj.assertFalse(
                os.path.exists(temp_dir_supposed_to_be_removed))
Example #29
    config.use_assaf = FLAGS.assaf
    print("Augment is : {}".format(int(config.Augment)))
    optimizer = tf.compat.v2.optimizers.Adam(beta_1=0.99)
    loader = Loader(batch_size=FLAGS.batch_size)
    Net_OOP.compile(optimizer=optimizer,
                    loss=loss_fn,
                    metrics=['acc', 'loss', 'val_acc', 'val_loss'])
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    if not os.path.exists(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    logger = get_logger(os.path.join(FLAGS.log_dir, "train_log"))
    Tensorcallback = callbacks.TensorBoard(FLAGS.log_dir,
                                           write_graph=False,
                                           write_images=False)
    Checkpoint = callbacks.ModelCheckpoint(filepath=FLAGS.model_path +
                                           "/checkpoint.hdf5",
                                           monitor='val_acc',
                                           mode='max',
                                           save_best_only=True)
    Checkpoint.set_model(Net_OOP)
    Tensorcallback.set_model(Net_OOP)
    callbacks = {'tensorboard': Tensorcallback, 'checkpoint': Checkpoint}
    Net_OOP.fit(logger=logger,
                callbacks=callbacks,
                epochs=FLAGS.epochs,
                steps_per_epoch=config.steps_per_epoch,
                val_freq=config.val_freq,
                val_steps=config.validation_steps,
                loader=loader)
Example #30
        def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
            with test.mock.patch.object(dc, '_run_std_server',
                                        self._make_mock_run_std_server()):
                # Condition variable that blocks the thread that represents the
                # restarted chief.
                cv = kwargs.get('cv', None)
                # `before_restart` is True for the threads that represent the original
                # chief and non-chief worker, and False for threads that represent the
                # restarted chief and non-chief workers.
                before_restart = kwargs['before_restart']
                if kwargs['new_chief']:
                    # `new_chief` is only True for the restarted chief thread. It waits
                    # until non-chief is preempted and restarted to simulate the causality
                    # where chief's restart results from non-chief's failure.
                    cv.acquire()
                    while not hasattr(cv, 'preempted'):
                        cv.wait()
                    cv.release()

                # Model building under strategy scope. The following is the code
                # we expect the user to run on every worker.
                strategy = get_strategy_object(strategy_cls)
                batch_size = 64
                steps = 3
                train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
                with strategy.scope():
                    model = _get_model((28, 28, 1))

                # Function to start a new thread. This will be called twice in the
                # following code: one represents the restart of the non-chief, and one
                # represents the restart of the chief as a result of the restart of the
                # non-chief (so the training can continue in sync).
                def start_new_thread(new_chief=False):
                    new_thread_tf_config = json.loads(os.environ['TF_CONFIG'])
                    new_thread_tf_config['cluster']['worker'] = kwargs[
                        'reserved_ports']
                    return self._run_task_in_thread(
                        task_fn=_independent_worker_fn,
                        cluster_spec=None,
                        task_type=None,
                        task_id=None,
                        tf_config=new_thread_tf_config,
                        before_restart=False,
                        cv=cv,
                        new_chief=new_chief)

                if test_base.is_chief() and before_restart:
                    # Chief to start a new thread (that will be blocked by a condition
                    # variable until the non-chief's new thread is started). The thread
                    # for (recovered) chief is started before entering `fit()` because
                    # the original chief thread will eventually hang and be ignored.
                    start_new_thread(new_chief=True)

                try:

                    class CkptSavedEpochAssertingCallback(callbacks.Callback):
                        def __init__(self, test_obj):
                            super(CkptSavedEpochAssertingCallback,
                                  self).__init__()
                            self.test_obj = test_obj

                        def on_epoch_begin(self, epoch, logs=None):
                            # `_ckpt_saved_epoch` attribute is set at the end of every epoch.
                            self.test_obj.assertEqual(
                                self.model._ckpt_saved_epoch is None,
                                epoch == 0)

                    callbacks_list = [
                        callbacks.ModelCheckpoint(
                            filepath=saving_filepath,
                            save_weights_only=True,
                            load_weights_on_restart=True),
                        CkptSavedEpochAssertingCallback(self)
                    ]
                    if before_restart:
                        callbacks_list.append(preemption_callback())

                    self.assertIsNone(model._ckpt_saved_epoch)
                    history = model.fit(x=train_ds,
                                        epochs=num_epoch,
                                        steps_per_epoch=steps,
                                        callbacks=callbacks_list)
                    self.assertIsNone(model._ckpt_saved_epoch)

                    # The training `history` is collected so the results can be
                    # compared against each other. The training results (loss and
                    # accuracy) are expected to be the same with or without
                    # preemption.
                    self._histories.append(history.history)

                except RuntimeError:
                    # pylint: disable=g-assert-in-except
                    self.assertTrue(before_restart)
                    # Reset the barrier so the new threads simulating recovery can
                    # continue.
                    self._barrier._counter = 0
                    self._barrier._flag = False

                    # Now that the non-chief has been preempted, it notifies the thread
                    # that simulates the restarted chief to start so they can be back in
                    # sync.
                    cv.acquire()
                    cv.preempted = True
                    cv.notify()
                    cv.release()

                    # At this point we should discard the original non-chief thread, and
                    # start the new thread that simulates the restarted non-chief, hence
                    # joining the thread and return.
                    self.join_independent_workers([start_new_thread()])
                    return

                # Successful end of a `fit()` call.
                self._successful_thread_ends += 1
                self.assertFalse(before_restart)