import numpy as np
# Imports assumed for this test excerpt (module paths as in the Elephas test
# suite; adjust the Keras import to your installation if needed):
from tensorflow.keras import optimizers
from elephas.ml_model import ElephasEstimator
from elephas.ml.adapter import to_data_frame


def test_batch_predict_classes_probability(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    fitted_pipeline = estimator.fit(df)
    results = fitted_pipeline.transform(test_df)

    # Set an inference batch size and transform again on the same test data
    inference_batch_size = int(len(y_test) / 10)
    fitted_pipeline.set_params(inference_batch_size=inference_batch_size)
    fitted_pipeline.set_params(outputCol="prediction_via_batch_inference")
    results_with_batch_prediction = fitted_pipeline.transform(results)

    # With 10 classes, each prediction column should hold an array of
    # 10 probabilities, and both inference paths should agree.
    results_np = results_with_batch_prediction.take(1)[0]
    assert len(results_np.prediction) == 10
    assert len(results_np.prediction_via_batch_inference) == 10
    assert np.array_equal(results_np.prediction, results_np.prediction_via_batch_inference)
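# A hedged follow-up sketch (not part of the test above): since the
# "prediction" column holds a 10-element probability vector, a hard class
# label can be recovered with an argmax UDF. The output column name
# "predicted_class" is hypothetical.
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

argmax_udf = udf(lambda probs: int(np.argmax(probs)), IntegerType())
labeled = results_with_batch_prediction.withColumn(
    "predicted_class", argmax_udf("prediction_via_batch_inference"))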
from pyspark.mllib.evaluation import MulticlassMetrics
from elephas import optimizers as elephas_optimizers
from elephas.ml_model import ElephasEstimator
from elephas.ml.adapter import to_data_frame

df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

# Initialize Spark ML Estimator
adadelta = elephas_optimizers.Adadelta()
estimator = ElephasEstimator(sc,
                             model,
                             nb_epoch=nb_epoch,
                             batch_size=batch_size,
                             optimizer=adadelta,
                             frequency='batch',
                             mode='asynchronous',
                             num_workers=2,
                             verbose=0,
                             validation_split=0.1,
                             categorical=True,
                             nb_classes=nb_classes)

# Fitting a model returns a Transformer
fitted_model = estimator.fit(df)

# Evaluate the fitted Spark model on the test data
prediction = fitted_model.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

# Go through .rdd here: DataFrame.map was removed in Spark 2.0
prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
print(metrics.precision())  # On Spark 3+, use metrics.accuracy /
print(metrics.recall())     # metrics.weightedRecall instead
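# An alternative evaluation sketch, assuming Spark 2+ and a DoubleType
# "prediction" column: the DataFrame-based evaluator computes the same kind
# of metrics without the RDD round-trip.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("accuracy:", evaluator.evaluate(prediction))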
estimator.setFeaturesCol("features")          # These two come directly from pyspark,
estimator.setLabelCol("label")                # hence the camel case. Sorry :)
estimator.set_keras_model_config(model.to_yaml())       # Provide serialized Keras model
estimator.set_optimizer_config(adadelta.get_config())   # Provide serialized Elephas optimizer
estimator.set_categorical_labels(True)
estimator.set_nb_classes(2)
estimator.set_num_workers(1)  # We just use one worker here. Feel free to adapt it.
estimator.set_nb_epoch(20)
estimator.set_batch_size(128)
estimator.set_verbosity(1)
estimator.set_validation_split(0.15)

fitted_model = estimator.fit(train_df)
prediction = fitted_model.transform(test_df)

pnl = prediction.select("label", "prediction")
pnl.show(100)

# The same flow also works wrapped in a Spark ML Pipeline (see the sketch below):
# from pyspark.ml import Pipeline
# pipeline = Pipeline(stages=[estimator])
# fitted_pipeline = pipeline.fit(train_df)  # Fit model to data
# prediction = fitted_pipeline.transform(test_df)
# pnl = prediction.select("index_category", "prediction")
# pnl.show(100)
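# A minimal runnable sketch of the commented-out Pipeline variant above,
# assuming the same train_df/test_df and estimator: ElephasEstimator is a
# regular Spark ML Estimator, so it can serve as a pipeline stage.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(train_df)
prediction = fitted_pipeline.transform(test_df)
prediction.select("label", "prediction").show(100)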