Beispiel #1
0
def make_model(data):
    """Fit an Elephas estimator on ``data`` and print predictions and precision.

    Builds a small Keras ``Sequential`` network, hands its serialized
    configuration to an ``ElephasEstimator``, fits it through a one-stage
    Spark ML ``Pipeline`` and evaluates the fitted pipeline on the same
    (training) data.

    Args:
        data: DataFrame-like object providing ``show``, ``dropna`` and
            ``select``. It must contain the columns ``label``, ``features``
            and ``text``.

    Returns:
        None. All results are printed.
    """
    data.show()
    data = data.dropna()
    nb_classes = data.select("label").distinct().count()
    input_dim = len(data.select("features").first()[0])

    print(nb_classes, input_dim)

    model = Sequential()
    # NOTE(review): Keras' Embedding treats ``input_dim`` as the vocabulary
    # size; here it receives the length of the feature vector -- confirm that
    # this is intended.
    model.add(Embedding(input_dim=input_dim, output_dim=100))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes, activation='softmax'))
    # Compile with the same loss the estimator is configured with below, so
    # the softmax output and the loss agree.
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    adam = optimizers.Adam(lr=0.01)
    opt_conf = optimizers.serialize(adam)

    estimator = ElephasEstimator()
    estimator.setFeaturesCol("features")
    estimator.setLabelCol("label")
    estimator.set_keras_model_config(model.to_yaml())
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)
    estimator.set_num_workers(1)
    estimator.set_epochs(20)
    estimator.set_batch_size(128)
    estimator.set_verbosity(1)
    estimator.set_validation_split(0.15)
    estimator.set_optimizer_config(opt_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])

    pipeline = Pipeline(stages=[estimator])

    fitted_pipeline = pipeline.fit(data)  # Fit model to data
    # Evaluated on the training data; pass a held-out DataFrame to
    # ``transform`` instead to evaluate on test data.
    prediction = fitted_pipeline.transform(data)
    prediction.select("text", "prediction").show(100)

    # MulticlassMetrics expects an RDD of (prediction, label) pairs, and a
    # DataFrame has to be converted through ``.rdd`` before mapping.
    prediction_and_label = prediction.select("prediction", "label").rdd.map(
        lambda row: (row.prediction, row.label))
    metrics = MulticlassMetrics(prediction_and_label)
    # NOTE(review): newer Spark releases require a label argument for
    # ``precision``; ``metrics.accuracy`` is the overall equivalent there --
    # confirm against the installed Spark version.
    print(metrics.precision())

    # ``show`` returns None, so keep the DataFrame and the call separate.
    pnl = prediction.select("label", "prediction")
    pnl.show(100)
Beispiel #2
0
def test_set_cols_deprecated(spark_context, regression_model,
                             boston_housing_dataset):
    """Fit and apply an ElephasEstimator configured via the column setters.

    The whole body runs under ``pytest.deprecated_call()``, so the test only
    passes if a deprecation warning is emitted while the estimator is being
    configured, fitted and applied with the renamed feature, label and
    output columns.

    Args:
        spark_context: fixture passed to ``to_data_frame``.
        regression_model: fixture whose ``to_yaml()`` output is used as the
            Keras model configuration.
        boston_housing_dataset: fixture unpacked into
            ``(x_train, y_train, x_test, y_test)``.
    """
    with pytest.deprecated_call():
        batch_size = 64
        epochs = 10

        x_train, y_train, x_test, y_test = boston_housing_dataset
        df = to_data_frame(spark_context, x_train, y_train)
        df = df.withColumnRenamed('features', 'scaled_features')
        df = df.withColumnRenamed('label', 'ground_truth')
        test_df = to_data_frame(spark_context, x_test, y_test)
        test_df = test_df.withColumnRenamed('features', 'scaled_features')
        test_df = test_df.withColumnRenamed('label', 'ground_truth')

        sgd = optimizers.SGD(lr=0.00001)
        sgd_conf = optimizers.serialize(sgd)
        estimator = ElephasEstimator()
        estimator.set_keras_model_config(regression_model.to_yaml())
        estimator.set_optimizer_config(sgd_conf)
        estimator.setFeaturesCol('scaled_features')
        estimator.setOutputCol('output')
        estimator.setLabelCol('ground_truth')
        estimator.set_mode("synchronous")
        estimator.set_loss("mae")
        estimator.set_metrics(['mae'])
        estimator.set_epochs(epochs)
        estimator.set_batch_size(batch_size)
        estimator.set_validation_split(0.01)
        estimator.set_categorical_labels(False)

        pipeline = Pipeline(stages=[estimator])
        fitted_pipeline = pipeline.fit(df)
        prediction = fitted_pipeline.transform(test_df)
        pnl = prediction.select("ground_truth", "output")
        pnl.show(100)

        # RegressionMetrics expects (prediction, observation) pairs, so the
        # model output comes first and the ground truth second.
        prediction_and_observations = pnl.rdd.map(
            lambda row: (row['output'], row['ground_truth']))
        metrics = RegressionMetrics(prediction_and_observations)
        print(metrics.r2)
Beispiel #3
0
# Excerpt: `model`, `nb_classes`, `string_indexer`, `scaler` and `train_df`
# are defined earlier in the script, outside this fragment.

# Output layer with one unit per class, followed by a softmax activation.
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

# Build an SGD optimizer and serialize it; the serialized form is what gets
# handed to the estimator below.
sgd = optimizers.SGD(lr=0.01)
sgd_conf = optimizers.serialize(sgd)

# Initialize the Elephas Spark ML estimator from the serialized Keras model
# (YAML) and the serialized optimizer, then set its training options.
estimator = ElephasEstimator()
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(sgd_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
estimator.set_metrics(['acc'])
# Column names the estimator reads features and labels from.
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("index_category")
estimator.set_epochs(10)
estimator.set_batch_size(128)
estimator.set_num_workers(1)
estimator.set_verbosity(0)
estimator.set_validation_split(0.15)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Chain the indexer, the scaler and the estimator into one pipeline.
# Fitting the pipeline returns a Transformer.
pipeline = Pipeline(stages=[string_indexer, scaler, estimator])
fitted_pipeline = pipeline.fit(train_df)

# Apply the fitted pipeline to the training DataFrame to obtain predictions.
prediction = fitted_pipeline.transform(train_df)
          activity_regularizer=regularizers.l2(0.01)))
# Excerpt: `model`, `nb_classes` and `sgd` are defined earlier in the
# script, outside this fragment.

# Activation and dropout for the preceding layer (defined outside this view).
model.add(Activation('relu'))
model.add(Dropout(rate=0.3))
# 256-unit dense layer with L2 activity regularization, then ReLU + dropout.
model.add(Dense(256, activity_regularizer=regularizers.l2(0.01)))
model.add(Activation('relu'))
model.add(Dropout(rate=0.3))
# Output layer with one unit per class and a sigmoid activation.
model.add(Dense(nb_classes))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='sgd')

# Model Summary
model.summary()

# Initialize SparkML Estimator and Get Settings
estimator = ElephasEstimator()
# Column names the estimator reads features and labels from.
estimator.setFeaturesCol("features")
estimator.setLabelCol("label_index")
# The Keras model is passed in serialized (YAML) form.
estimator.set_keras_model_config(model.to_yaml())
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(1)
estimator.set_epochs(25)
estimator.set_batch_size(64)
estimator.set_verbosity(1)
estimator.set_validation_split(0.10)
# NOTE(review): `sgd` is defined outside this excerpt -- confirm it is a
# serialized optimizer config rather than an optimizer instance.
estimator.set_optimizer_config(sgd)
estimator.set_mode("synchronous")
estimator.set_loss("binary_crossentropy")
estimator.set_metrics(['acc'])

# Create Deep Learning Pipeline
Beispiel #5
0
# Excerpt: `model`, `nb_classes`, `string_indexer`, `scaler` and `train_df`
# are defined earlier in the script, outside this fragment.

# Activation and dropout for the preceding layer (defined outside this view).
model.add(Activation('relu'))
model.add(Dropout(0.5))
# 512-unit dense layer followed by ReLU and dropout.
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Output layer with one unit per class, followed by a softmax activation.
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')


# Initialize Elephas Spark ML Estimator
# The optimizer comes from `elephas_optimizers`; its `get_config()` result
# is what the estimator receives below.
adagrad = elephas_optimizers.Adagrad()

estimator = ElephasEstimator()
# Column names the estimator reads features and labels from.
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("index_category")
# The Keras model is passed in serialized (YAML) form.
estimator.set_keras_model_config(model.to_yaml())
estimator.set_optimizer_config(adagrad.get_config())
# presumably the epoch count (legacy `nb_epoch` naming) -- confirm against
# the installed elephas version
estimator.set_nb_epoch(10)
estimator.set_batch_size(128)
estimator.set_num_workers(4)
estimator.set_verbosity(0)
estimator.set_validation_split(0.15)
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)

# Chain the indexer, the scaler and the estimator into one pipeline.
# Fitting the pipeline returns a Transformer.
pipeline = Pipeline(stages=[string_indexer, scaler, estimator])
fitted_pipeline = pipeline.fit(train_df)
Beispiel #6
0
# Excerpt: `model` is built earlier in the script, outside this fragment.
# Finish the network with a softmax activation and compile it.
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')




from elephas.ml_model import ElephasEstimator
from elephas import optimizers as elephas_optimizers

# Define elephas optimizer (which tells the model how to aggregate updates on the Spark master)
adadelta = elephas_optimizers.Adadelta()

# Initialize SparkML Estimator and set all relevant properties.
# `model` and `nb_classes` are defined earlier in the script, outside this
# fragment.
estimator = ElephasEstimator()
estimator.setFeaturesCol("scaled_features")             # These two come directly from pyspark,
estimator.setLabelCol("index_category")                 # hence the camel case. Sorry :)
estimator.set_keras_model_config(model.to_yaml())       # Provide serialized Keras model
estimator.set_optimizer_config(adadelta.get_config())   # Provide serialized Elephas optimizer
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(1)  # We just use one worker here. Feel free to adapt it.
estimator.set_nb_epoch(20)  # presumably the epoch count (legacy `nb_epoch` naming) -- confirm
estimator.set_batch_size(128)
estimator.set_verbosity(1)
estimator.set_validation_split(0.15)




from pyspark.ml import Pipeline