def test_keras_model_checkpoint_callback(self, mock_fit_fn, mock_pin_gpu_fn):
    from horovod.tensorflow.keras.callbacks import BestModelCheckpoint

    def _get_mock_fit_fn(checkpoint_callback_provided):
        def fit(model, train_data, val_data, steps_per_epoch, validation_steps,
                callbacks, verbose):
            returned_model_checkpoint_present = False
            model_checkpoint_present = False
            for callback in callbacks:
                callback.set_model(model)
                # The epoch-end hook fires the same way regardless of which
                # checkpoint callback is in use.
                callback.on_epoch_end(0, logs={'binary_crossentropy': 0.3})

                if checkpoint_callback_provided and \
                        isinstance(callback, BestModelCheckpoint):
                    self.assertIsNotNone(callback.filepath)
                    self.assertTrue(callback.save_best_only)
                    self.assertEqual(callback.monitor, 'binary_crossentropy')
                    returned_model_checkpoint_present = True

                if not checkpoint_callback_provided and \
                        isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                    self.assertFalse(callback.save_best_only)
                    self.assertEqual(callback.monitor, 'val_loss')
                    model_checkpoint_present = True

            if checkpoint_callback_provided:
                self.assertTrue(returned_model_checkpoint_present)
                self.assertFalse(model_checkpoint_present)
            else:
                self.assertFalse(returned_model_checkpoint_present)
                self.assertTrue(model_checkpoint_present)

            return mock.Mock()
        return fit

    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_model_checkpoint_callbacks') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
                loss = 'binary_crossentropy'

                # Test that when no checkpoint callback is provided, the
                # default ModelCheckpoint is created.
                mock_fit_fn.return_value = _get_mock_fit_fn(
                    checkpoint_callback_provided=False)
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()

                # Test that a user-provided checkpoint callback is passed
                # through to the model unchanged.
                mock_fit_fn.return_value = _get_mock_fit_fn(
                    checkpoint_callback_provided=True)
                checkpoint_callback = BestModelCheckpoint(
                    monitor='binary_crossentropy')
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2,
                    checkpoint_callback=checkpoint_callback)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
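# For context, a minimal sketch of how this test is expected to be wired up.
# The exact patch targets below are assumptions (they are not shown in this
# excerpt); what matters is that stacked `mock.patch` decorators inject their
# mocks in bottom-up order, so the innermost decorator supplies `mock_fit_fn`
# and the outermost supplies `mock_pin_gpu_fn`:
#
#   @mock.patch('horovod.spark.keras.remote._pin_gpu_fn')   # assumed target
#   @mock.patch('horovod.spark.keras.remote._get_fit_fn')   # assumed target
#   def test_keras_model_checkpoint_callback(self, mock_fit_fn, mock_pin_gpu_fn):
#       ...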
def train(
    max_sales: float,
    vocab: Dict[str, List[Any]],
    hp: Hyperparameters,
    work_dir: FlyteDirectory,
    train_df: pyspark.sql.DataFrame,
    working_dir: FlyteDirectory,
):
    print("==============")
    print("Model training")
    print("==============")

    # metric: root mean square percentage error of the exponential of predictions
    def exp_rmspe(y_true, y_pred):
        """Competition evaluation metric; expects logarithmic inputs."""
        pct = tf.square((tf.exp(y_true) - tf.exp(y_pred)) / tf.exp(y_true))
        # compute the mean, excluding stores with a zero denominator
        x = tf.reduce_sum(tf.where(y_true > 0.001, pct, tf.zeros_like(pct)))
        y = tf.reduce_sum(
            tf.where(y_true > 0.001, tf.ones_like(pct), tf.zeros_like(pct)))
        return tf.sqrt(x / y)

    def act_sigmoid_scaled(x):
        """Sigmoid scaled to the logarithm of maximum sales, with 20% headroom."""
        return tf.nn.sigmoid(x) * tf.math.log(max_sales) * 1.2

    # NOTE: exp_rmspe and act_sigmoid_scaled are defined inside `train` rather
    # than at module level: act_sigmoid_scaled must close over max_sales because
    # an activation function only receives the layer input, and since both are
    # registered as custom objects it is cleaner to define them together here
    # than to split them between module level and function scope.
    all_cols = CATEGORICAL_COLS + CONTINUOUS_COLS
    CUSTOM_OBJECTS = {
        "exp_rmspe": exp_rmspe,
        "act_sigmoid_scaled": act_sigmoid_scaled,
    }

    # disable GPUs when building the model to prevent memory leaks
    if LooseVersion(tf.__version__) >= LooseVersion("2.0.0"):
        # See https://github.com/tensorflow/tensorflow/issues/33168
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        K.set_session(
            tf.Session(config=tf.ConfigProto(device_count={"GPU": 0})))

    # build the Keras model
    inputs = {col: Input(shape=(1,), name=col) for col in all_cols}
    embeddings = [
        Embedding(len(vocab[col]), 10, input_length=1,
                  name="emb_" + col)(inputs[col])
        for col in CATEGORICAL_COLS
    ]
    continuous_bn = Concatenate()([
        Reshape((1, 1), name="reshape_" + col)(inputs[col])
        for col in CONTINUOUS_COLS
    ])
    continuous_bn = BatchNormalization()(continuous_bn)

    x = Concatenate()(embeddings + [continuous_bn])
    x = Flatten()(x)
    x = Dense(1000, activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(1000, activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(1000, activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dense(500, activation="relu",
              kernel_regularizer=tf.keras.regularizers.l2(0.00005))(x)
    x = Dropout(0.5)(x)
    # element-wise activation that bounds predictions by the observed sales range
    output = Dense(1, activation=act_sigmoid_scaled)(x)
    model = tf.keras.Model([inputs[f] for f in all_cols], output)

    # display the details of the Keras model
    model.summary()

    opt = tf.keras.optimizers.Adam(learning_rate=hp.learning_rate, epsilon=1e-3)

    # checkpoint callback that selects the best model to return from training
    ckpt_callback = BestModelCheckpoint(monitor="val_loss", mode="auto",
                                        save_freq="epoch")

    # create a Store for intermediate training data and checkpoints
    store = Store.create(work_dir.remote_source)

    # SparkBackend uses `horovod.spark.run` to execute the distributed training
    # function, running it on every worker in the cluster and returning a list
    # of results
    backend = SparkBackend(
        num_proc=hp.num_proc,
        stdout=sys.stdout,
        stderr=sys.stderr,
        prefix_output_with_timestamp=True,
    )

    # define a Spark Estimator that fits Keras models to a DataFrame
    keras_estimator = hvd.KerasEstimator(
        backend=backend,
        store=store,
        model=model,
        optimizer=opt,
        loss="mae",
        metrics=[exp_rmspe],
        custom_objects=CUSTOM_OBJECTS,
        feature_cols=all_cols,
        label_cols=["Sales"],
validation="Validation", batch_size=hp.batch_size, epochs=hp.epochs, verbose=2, checkpoint_callback=ckpt_callback, ) # The Estimator hides the following details: # 1. Binding Spark DataFrames to a deep learning training script # 2. Reading data into a format that can be interpreted by the training framework # 3. Distributed training using Horovod # the user would provide a Keras model to the `KerasEstimator`` # this `KerasEstimator`` will fit the data and store it in a Spark DataFrame keras_model = keras_estimator.fit(train_df).setOutputCols(["Sales_output"]) # retrieve the model training history history = keras_model.getHistory() best_val_rmspe = min(history["val_exp_rmspe"]) print("Best RMSPE: %f" % best_val_rmspe) # save the trained model keras_model.save(os.path.join(working_dir, hp.local_checkpoint_file)) print("Written checkpoint to %s" % os.path.join(working_dir, hp.local_checkpoint_file)) # the Estimator returns a Transformer representation of the trained model once training is complete return keras_model