Code example #1
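    # The mock_fit_fn and mock_pin_gpu_fn parameters imply this test method is
    # wrapped with mock.patch decorators that are not part of this excerpt;
    # a plausible shape (the exact patch targets here are assumptions):
    #   @mock.patch('horovod.spark.keras.remote._pin_gpu_fn')
    #   @mock.patch('horovod.spark.keras.remote._get_fit_fn')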
    def test_keras_model_checkpoint_callback(self, mock_fit_fn, mock_pin_gpu_fn):
        from horovod.tensorflow.keras.callbacks import BestModelCheckpoint

        def _get_mock_fit_fn(checkpoint_callback_provided):
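            # Stand-in for the remote fit function: instead of training, it walks
            # the callbacks the estimator assembled and asserts that the expected
            # checkpoint callback is present and configured correctly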
            def fit(model, train_data, val_data, steps_per_epoch, validation_steps, callbacks,
                    verbose):
                returned_model_checkpoint_present = False
                model_checkpoint_present = False
                for callback in callbacks:
                    callback.set_model(model)
                    callback.on_epoch_end(0, logs={'binary_crossentropy': 0.3})

                    if checkpoint_callback_provided and isinstance(callback, BestModelCheckpoint):
                        self.assertIsNotNone(callback.filepath)
                        self.assertTrue(callback.save_best_only)
                        self.assertEqual(callback.monitor, 'binary_crossentropy')
                        returned_model_checkpoint_present = True

                    if not checkpoint_callback_provided and isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                        self.assertFalse(callback.save_best_only)
                        self.assertEqual(callback.monitor, 'val_loss')
                        model_checkpoint_present = True

                if checkpoint_callback_provided:
                    self.assertTrue(returned_model_checkpoint_present)
                    self.assertFalse(model_checkpoint_present)
                else:
                    self.assertFalse(returned_model_checkpoint_present)
                    self.assertTrue(model_checkpoint_present)

                return mock.Mock()

            return fit

        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_model_checkpoint_callbacks') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
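                # Redirect the store's train/val data paths to fixed local paths
                # so prepare_data() writes where fit_on_parquet() will later read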
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
                    loss = 'binary_crossentropy'

                    # Test that when no checkpoint callback is provided, the
                    # default ModelCheckpoint is created with the expected options
                    mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=False)
                    est = hvd.KerasEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        feature_cols=['features'],
                        label_cols=['y'],
                        batch_size=1,
                        epochs=3,
                        verbose=2)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()

                    # Test that a user-provided checkpoint callback is passed
                    # through to the fit function unchanged
                    mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=True)
                    checkpoint_callback = BestModelCheckpoint(monitor='binary_crossentropy')
                    est = hvd.KerasEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        feature_cols=['features'],
                        label_cols=['y'],
                        batch_size=1,
                        epochs=3,
                        verbose=2,
                        checkpoint_callback=checkpoint_callback)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
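Both estimators above train via fit_on_parquet(), which reads the train/val Parquet files that util.prepare_data() already materialized in the store; the standard Estimator entry point fit(df) would instead prepare the DataFrame itself first. A minimal sketch of the two entry points (illustrative only, using the objects defined above):

# fit(df) converts the DataFrame to Parquet in the store, then trains:
transformer = est.fit(df)
# fit_on_parquet() skips the conversion and trains on the store's existing Parquet:
transformer = est.fit_on_parquet()
# either way, the result is a Transformer that appends prediction columns:
predictions = transformer.transform(df)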
Code example #2
def train(
    max_sales: float,
    vocab: Dict[str, List[Any]],
    hp: Hyperparameters,
    work_dir: FlyteDirectory,
    train_df: pyspark.sql.DataFrame,
    working_dir: FlyteDirectory,
):
    print("==============")
    print("Model training")
    print("==============")

    # root mean squared percentage error (RMSPE), computed on the exponentials
    # of the log-scale predictions
    def exp_rmspe(y_true, y_pred):
        """Competition evaluation metric, expects logarmithic inputs."""
        pct = tf.square((tf.exp(y_true) - tf.exp(y_pred)) / tf.exp(y_true))

        # compute mean excluding stores with zero denominator
        x = tf.reduce_sum(tf.where(y_true > 0.001, pct, tf.zeros_like(pct)))
        y = tf.reduce_sum(
            tf.where(y_true > 0.001, tf.ones_like(pct), tf.zeros_like(pct)))
        return tf.sqrt(x / y)
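    # Illustrative check of exp_rmspe (values are assumptions; TF2 eager mode):
    #   y_true = tf.math.log(tf.constant([100.0, 200.0]))
    #   y_pred = tf.math.log(tf.constant([110.0, 180.0]))
    #   exp_rmspe(y_true, y_pred)  # sqrt(((10/100)**2 + (20/200)**2) / 2) = 0.1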

    def act_sigmoid_scaled(x):
        """Sigmoid scaled to logarithm of maximum sales scaled by 20%."""
        return tf.nn.sigmoid(x) * tf.math.log(max_sales) * 1.2

    # NOTE: exp_rmspe and act_sigmoid_scaled are defined inside train() rather than
    # at module level: act_sigmoid_scaled must close over max_sales, since a Keras
    # activation function cannot take extra arguments, and keeping both custom
    # objects in one place keeps them consistent

    all_cols = CATEGORICAL_COLS + CONTINUOUS_COLS
    CUSTOM_OBJECTS = {
        "exp_rmspe": exp_rmspe,
        "act_sigmoid_scaled": act_sigmoid_scaled
    }
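    # Keras serializes custom functions by name only, so both custom objects must
    # be supplied again wherever the model is deserialized; they are passed to the
    # estimator below via its custom_objects parameter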

    # disable GPUs when building the model to prevent memory leaks
    if LooseVersion(tf.__version__) >= LooseVersion("2.0.0"):
        # See https://github.com/tensorflow/tensorflow/issues/33168
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        K.set_session(
            tf.Session(config=tf.ConfigProto(device_count={"GPU": 0})))

    # build the Keras model
    inputs = {col: Input(shape=(1, ), name=col) for col in all_cols}
    embeddings = [
        Embedding(len(vocab[col]), 10, input_length=1,
                  name="emb_" + col)(inputs[col]) for col in CATEGORICAL_COLS
    ]
    continuous_bn = Concatenate()([
        Reshape((1, 1), name="reshape_" + col)(inputs[col])
        for col in CONTINUOUS_COLS
    ])
    continuous_bn = BatchNormalization()(continuous_bn)
    x = Concatenate()(embeddings + [continuous_bn])
    x = Flatten()(x)
    x = Dense(1000,
              activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(1000,
              activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(1000,
              activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(500,
              activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dropout(0.5)(x)
    # specify element-wise activation
    output = Dense(1, activation=act_sigmoid_scaled)(x)
    model = tf.keras.Model([inputs[f] for f in all_cols], output)
    # display the details of the Keras model
    model.summary()

    opt = tf.keras.optimizers.Adam(learning_rate=hp.learning_rate, epsilon=1e-3)

    # checkpoint callback configuring which checkpoint is kept as the
    # returned Keras model
    ckpt_callback = BestModelCheckpoint(monitor="val_loss",
                                        mode="auto",
                                        save_freq="epoch")

    # create a Store instance pointing at the remote working directory
    store = Store.create(work_dir.remote_source)
    # 'SparkBackend' uses `horovod.spark.run` to execute the distributed training function, and
    # returns a list of results by running 'train' on every worker in the cluster
    backend = SparkBackend(
        num_proc=hp.num_proc,
        stdout=sys.stdout,
        stderr=sys.stderr,
        prefix_output_with_timestamp=True,
    )
    # define a Spark Estimator that fits Keras models to a DataFrame
    keras_estimator = hvd.KerasEstimator(
        backend=backend,
        store=store,
        model=model,
        optimizer=opt,
        loss="mae",
        metrics=[exp_rmspe],
        custom_objects=CUSTOM_OBJECTS,
        feature_cols=all_cols,
        label_cols=["Sales"],
        validation="Validation",
        batch_size=hp.batch_size,
        epochs=hp.epochs,
        verbose=2,
        checkpoint_callback=ckpt_callback,
    )

    # The Estimator hides the following details:
    # 1. Binding Spark DataFrames to a deep learning training script
    # 2. Reading data into a format that can be interpreted by the training framework
    # 3. Distributed training using Horovod
    # The user provides a Keras model to the `KerasEstimator`, which fits it to
    # the DataFrame and returns the trained model wrapped as a Transformer
    keras_model = keras_estimator.fit(train_df).setOutputCols(["Sales_output"])
    # retrieve the model training history
    history = keras_model.getHistory()
    best_val_rmspe = min(history["val_exp_rmspe"])
    print("Best RMSPE: %f" % best_val_rmspe)

    # save the trained model
    keras_model.save(os.path.join(working_dir, hp.local_checkpoint_file))
    print("Written checkpoint to %s" %
          os.path.join(working_dir, hp.local_checkpoint_file))
    # the Estimator returns a Transformer representation of the trained model once training is complete
    return keras_model
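For context, train() is designed to run inside a Flyte Spark task. A minimal sketch of such a wrapper follows; the task configuration values and the data_preparation() helper are assumptions for illustration, not part of the source:

import flytekit
from flytekit import task
from flytekitplugins.spark import Spark

@task(task_config=Spark(spark_conf={"spark.driver.memory": "2000M",
                                    "spark.executor.memory": "2000M",
                                    "spark.executor.instances": "2"}))
def horovod_spark_task(
    data_dir: FlyteDirectory, hp: Hyperparameters, work_dir: FlyteDirectory
) -> FlyteDirectory:
    # data_preparation() is a hypothetical upstream helper producing the inputs
    max_sales, vocab, train_df, test_df = data_preparation(data_dir, hp)
    # stage local outputs (the saved checkpoint) in the task's working directory
    working_dir = flytekit.current_context().working_directory
    train(max_sales, vocab, hp, work_dir, train_df, working_dir)
    return work_dir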