def _test_optimizer(optimizer, target=0.75):
    x_train, y_train = get_test_data()
    model = get_model(x_train.shape[1], 10, y_train.shape[1])
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
    assert history.history['acc'][-1] >= target

    # Serialization round trip: serialize -> deserialize -> serialize
    # should yield an identical config.
    config = optimizers.serialize(optimizer)
    custom_objects = {optimizer.__class__.__name__: optimizer.__class__}
    optim = optimizers.deserialize(config, custom_objects)
    new_config = optimizers.serialize(optim)
    assert config == new_config
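# A minimal usage sketch for the helper above: get_test_data() and get_model()
# are assumed to be defined elsewhere in this test module, and test_sgd is a
# hypothetical example, not part of the original suite.
def test_sgd():
    _test_optimizer(optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True))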
def test_functional_model(spark_context, classification_model_functional, mnist_data):
    batch_size = 64
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD()
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model_functional.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
def test_custom_objects(spark_context, boston_housing_dataset):
    def custom_activation(x):
        return 2 * relu(x)

    model = Sequential()
    model.add(Dense(64, input_shape=(13,)))
    model.add(Dense(64, activation=custom_activation))
    model.add(Dense(1, activation='linear'))

    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    test_df = to_data_frame(spark_context, x_test, y_test)

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(10)
    estimator.set_batch_size(32)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)
    # Register the custom activation so the model can be deserialized on workers.
    estimator.set_custom_objects({'custom_activation': custom_activation})

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
def test_regression_model(spark_context, regression_model, boston_housing_dataset):
    batch_size = 64
    epochs = 10

    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    test_df = to_data_frame(spark_context, x_test, y_test)

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(regression_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_observations = pnl.rdd.map(
        lambda row: (row.label, row.prediction))
    metrics = RegressionMetrics(prediction_and_observations)
    print(metrics.r2)
def get_config(self):
    config = {
        'optimizer': optimizers.serialize(self.optimizer),
        'multipliers': self.multipliers,
    }
    base_config = super(LRMultiplier, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
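# Sketch of the matching deserialization path, assuming the standard Keras
# from_config idiom: the serialized optimizer dict produced by get_config()
# above has to be rebuilt into an optimizer instance before the wrapper is
# constructed. This is an illustration, not necessarily the library's code.
@classmethod
def from_config(cls, config, custom_objects=None):
    optimizer = optimizers.deserialize(
        config.pop('optimizer'), custom_objects=custom_objects)
    return cls(optimizer=optimizer, **config)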
def test_spark_ml_model_classification(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    # Evaluate the Spark model by evaluating the underlying Keras model
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    # In a multi-class problem the prediction column holds a probability
    # vector, so take the argmax; the cast to double is required by
    # MulticlassMetrics.
    pnl = pnl.select(
        'label', argmax('prediction').astype(DoubleType()).alias('prediction'))
    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
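# argmax above is a project helper that returns a Spark Column. If it were
# not available, a minimal sketch with a plain PySpark UDF (hypothetical
# implementation, same helper name) could look like this:
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def argmax(col):
    # Map a probability vector to the index of its largest entry, as a double.
    return udf(lambda v: float(np.argmax(v)), DoubleType())(col)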
def test_batch_predict_classes_probability(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    fitted_pipeline = estimator.fit(df)
    results = fitted_pipeline.transform(test_df)

    # Set an inference batch size, then transform again on the same test data
    inference_batch_size = int(len(y_test) / 10)
    fitted_pipeline.set_params(inference_batch_size=inference_batch_size)
    fitted_pipeline.set_params(outputCol="prediction_via_batch_inference")
    results_with_batch_prediction = fitted_pipeline.transform(results)

    # Each prediction column should hold an array of 10 elements, since there
    # are 10 classes and therefore 10 probabilities.
    results_np = results_with_batch_prediction.take(1)[0]
    assert len(results_np.prediction) == 10
    assert len(results_np.prediction_via_batch_inference) == 10
    assert np.array_equal(results_np.prediction,
                          results_np.prediction_via_batch_inference)
def test_save_pipeline(spark_context, classification_model):
    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(10)
    estimator.set_batch_size(10)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)

    # Persist the (unfitted) pipeline to disk
    pipeline = Pipeline(stages=[estimator])
    pipeline.save('tmp')
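# A possible follow-up check (not in the original test): an unfitted Pipeline
# saved this way can be read back with Pipeline.load from the standard
# pyspark.ml persistence API, assuming ElephasEstimator supports pyspark's
# DefaultParamsReadable machinery.
loaded = Pipeline.load('tmp')
assert len(loaded.getStages()) == 1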
def test_set_cols_deprecated(spark_context, regression_model, boston_housing_dataset):
    with pytest.deprecated_call():
        batch_size = 64
        epochs = 10

        x_train, y_train, x_test, y_test = boston_housing_dataset
        df = to_data_frame(spark_context, x_train, y_train)
        df = df.withColumnRenamed('features', 'scaled_features')
        df = df.withColumnRenamed('label', 'ground_truth')
        test_df = to_data_frame(spark_context, x_test, y_test)
        test_df = test_df.withColumnRenamed('features', 'scaled_features')
        test_df = test_df.withColumnRenamed('label', 'ground_truth')

        sgd = optimizers.SGD(lr=0.00001)
        sgd_conf = optimizers.serialize(sgd)
        estimator = ElephasEstimator()
        estimator.set_keras_model_config(regression_model.to_yaml())
        estimator.set_optimizer_config(sgd_conf)
        # The camel-cased setters are the deprecated API under test.
        estimator.setFeaturesCol('scaled_features')
        estimator.setOutputCol('output')
        estimator.setLabelCol('ground_truth')
        estimator.set_mode("synchronous")
        estimator.set_loss("mae")
        estimator.set_metrics(['mae'])
        estimator.set_epochs(epochs)
        estimator.set_batch_size(batch_size)
        estimator.set_validation_split(0.01)
        estimator.set_categorical_labels(False)

        pipeline = Pipeline(stages=[estimator])
        fitted_pipeline = pipeline.fit(df)
        prediction = fitted_pipeline.transform(test_df)
        pnl = prediction.select("ground_truth", "output")
        pnl.show(100)

        prediction_and_observations = pnl.rdd.map(
            lambda row: (row['ground_truth'], row['output']))
        metrics = RegressionMetrics(prediction_and_observations)
        print(metrics.r2)
def get_config(self):
    config = {
        'units': self.units,
        'learning_rate': self.learning_rate,
        'online': self.online,
        'n_passes': self.n_passes,
        'return_hidden': self.return_hidden,
        'visible_activation': activations.serialize(self.visible_activation),
        'hidden_activation': activations.serialize(self.hidden_activation),
        'use_bias': self.use_bias,
        'kernel_initializer': initializers.serialize(self.kernel_initializer),
        'bias_initializer': initializers.serialize(self.bias_initializer),
        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
        'activity_regularizer': regularizers.serialize(self.activity_regularizer),
        'kernel_constraint': constraints.serialize(self.kernel_constraint),
        'bias_constraint': constraints.serialize(self.bias_constraint),
        'optimizer': optimizers.serialize(self.optimizer),
    }
    base_config = super(OnlineBolzmannCell, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
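# A round-trip sketch for the cell above, assuming the constructor follows
# the usual Keras convention of accepting serialized sub-objects via the
# *.get() resolvers (units=32 is an arbitrary illustrative value):
cell = OnlineBolzmannCell(units=32)
restored = OnlineBolzmannCell.from_config(cell.get_config())
assert restored.units == cell.units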
def test_predict_classes_probability(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    # Return class probabilities instead of predicted class indices
    estimator.set_predict_classes(False)
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    results = fitted_pipeline.transform(test_df)

    # We should have an array of 10 elements in the prediction column,
    # since we have 10 classes and therefore 10 probabilities.
    assert len(results.take(1)[0].prediction) == 10
# Build a three-hidden-layer MLP classifier.
model = Sequential()
model.add(Dense(512, input_shape=(input_dim,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

sgd = optimizers.SGD(lr=0.01)
sgd_conf = optimizers.serialize(sgd)

# Initialize Elephas Spark ML Estimator
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
estimator.set_metrics(['acc'])
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("index_category")
estimator.set_epochs(10)
estimator.set_batch_size(128)
estimator.set_num_workers(1)
estimator.set_verbosity(0)
estimator.set_validation_split(0.15)
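# Sketch of how the configured estimator would typically be used, assuming
# df is a DataFrame whose upstream preprocessing produced the scaled_features
# and index_category columns referenced above:
pipeline = Pipeline(stages=[estimator])
fitted_pipeline = pipeline.fit(df)
prediction = fitted_pipeline.transform(df)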