Example 1
    def test_raw_and_probability_prediction(self):

        data_path = "data/mllib/sample_multiclass_classification_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                             blockSize=128, seed=123)
        model = mlp.fit(df)
        test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
        result = model.transform(test).head()
        expected_prediction = 2.0
        expected_probability = [0.0, 0.0, 1.0]
        expected_rawPrediction = [57.3955, -124.5462, 67.9943]
        self.assertEqual(result.prediction, expected_prediction)
        self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
        self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4))
def naiveBayeseian():

    def parseLine(line):
        keys = [float(x) for x in line.split(",")]
        #return LabeledPoint(keys[0],keys[1:])
        return keys
    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    data= scdata1.map(parseLine)
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
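    # Note (hedged): in Spark 2.0+ MulticlassClassificationEvaluator no longer accepts
    # metricName="precision". A minimal equivalent sketch for newer Spark versions,
    # reusing the predictionAndLabels DataFrame built above:
    accuracy_eval = MulticlassClassificationEvaluator(metricName="accuracy")
    wprecision_eval = MulticlassClassificationEvaluator(metricName="weightedPrecision")
    print("Accuracy: " + str(accuracy_eval.evaluate(predictionAndLabels)))
    print("Weighted precision: " + str(wprecision_eval.evaluate(predictionAndLabels)))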
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)

    # handle open data
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3], blockSize=128,
                                                  featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))

    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], blockSize=128,
                                                   featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
    # Split data set
    training_df, testing_df = no_emptys_df.randomSplit([.75, .25])

    # Make Spark ML pipeline using a MultilayerPerceptronClassifier
    hashingTF = HashingTF(inputCol='words',
                          outputCol='word_hash',
                          numFeatures=500)
    idf = IDF(minDocFreq=1,
              inputCol=hashingTF.getOutputCol(),
              outputCol='tf-idf')
    va = VectorAssembler(inputCols=[
        'has_link', 'verb_count', 'tf-idf', 'word_count', 'has_q', 'has_tag'
    ])
    mp = MultilayerPerceptronClassifier(
        featuresCol=va.getOutputCol(),
        layers=[505, 250, 100, 50, 25, 10, 5, 2])

    # Create param grid
    grid = ParamGridBuilder().addGrid(mp.maxIter, [50, 100, 200]).addGrid(
        mp.tol,
        [.0000001, .000001, .0001, .01]).addGrid(mp.stepSize,
                                                 [.001, .01, .1]).build()

    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')

    pipeline = Pipeline(stages=[hashingTF, idf, va, mp])

    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=grid,
                        evaluator=evaluator,
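                        # The excerpt cuts off here; the remaining argument and the
                        # lines below are a hedged, assumed completion (numFolds is a guess).
                        numFolds=3)

    # Hypothetical continuation: fit the cross-validated pipeline on the training split
    # and score the held-out split with the BinaryClassificationEvaluator defined above.
    cv_model = cv.fit(training_df)
    cv_predictions = cv_model.transform(testing_df)
    print("Held-out evaluation: " + str(evaluator.evaluate(cv_predictions)))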
    # Preliminary analysis
    #################################################################
    clean_riskdata.describe().show()
    riskdata.stat.crosstab("bad", "job").show()
    riskdata.stat.crosstab("bad", "reason").show()
    #################################################################
    # Multilayer Perceptron Classifier
    #################################################################

    # specify layers for the neural network:
    # input layer of size 10 (features), two intermediate of size 3 and 2
    # and output of size 2 (classes)
    layers = [10, 3, 2, 2]
    # create the trainer and set its parameters
    MLPtrainer = MultilayerPerceptronClassifier(maxIter = 100, layers = layers,
                                             labelCol = "bad", featuresCol = "predictors",
                                             predictionCol = "prediction", 
                                             blockSize = 1000, seed = 1234)
    # train the model
    MLP_model = MLPtrainer.fit(train)
    
    # compute precision on the test set
    MLP_result = MLP_model.transform(test)
    MLP_predictionAndLabels = MLP_result.select("prediction", "bad")
    MLP_evaluator = MulticlassClassificationEvaluator(metricName="precision")
    #print(MLP_model)
    #print(str(MLP_result.show())) # Print first 20 rows result to output file (plain text)

""""
    #################################################################
    # Decision Tree Classification
    #################################################################
Example 6
        'oh_s_gender', 'oh_s_geography', 'CreditScore', 'Age', 'Tenure',
        'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
        'EstimatedSalary'
    ],
                    outputCol='features'))

# stage for scaling the features using MinMax scaler
stages.append(MinMaxScaler(inputCol='features', outputCol='scaledfeatures'))

# stage for MultilayerPerceptronClassifier(ANN implementation in Spark)
layers = [
    13, 6, 6, 6, 2
]  # 13 input features, three hidden layers with 6 neurons each, output layer with 2 neurons (for 2 output labels)
stages.append(
    MultilayerPerceptronClassifier(labelCol="s_exited",
                                   featuresCol="scaledfeatures",
                                   maxIter=200,
                                   layers=layers))

#stage for reverse indexing the prediction label
stages.append(
    IndexToString(inputCol='prediction',
                  outputCol='lab_prediction',
                  labels=stages[0].labels))

#  making the pipeline model
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)  # Making Pipeline

# making/Training the model using trainingData
model = pipeline.fit(trainingData)
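
# Hypothetical follow-up (not in the original excerpt): score a held-out split and
# measure accuracy on the indexed label column used above. `testData` is an assumed
# DataFrame with the same schema as trainingData.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol='s_exited',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print('Test accuracy = {}'.format(evaluator.evaluate(predictions)))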
Example 7
# In[14]:

#set parameters for an MLP (multilayer perceptron) model

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

layers = [[5, 3, 2], [5, 4, 2], [5, 5, 2]]
maxAccuracy = 0
bestLayer = []

for layer in layers:
    trainer = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=layer,
                                             blockSize=128)
    param = trainer.setParams(featuresCol="features", labelCol="target")
    #use K-Fold validation to tune the model
    #pyspark library
    grid = ParamGridBuilder().build()
    # .addGrid(trainer.maxIter, [0, 1]) random forest
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                              labelCol="target")
    cv = CrossValidator(estimator=trainer,
                        estimatorParamMaps=grid,
                        evaluator=evaluator,
                        numFolds=5)
    cv.extractParamMap()
    cvModel = cv.fit(df_train)
    print(layer)
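    # Hypothetical continuation (the excerpt stops here): CrossValidatorModel exposes
    # avgMetrics (one value per param map; the grid here is empty, so a single value),
    # i.e. the cross-validated areaUnderROC from the evaluator above.
    score = cvModel.avgMetrics[0]
    if score > maxAccuracy:
        maxAccuracy = score
        bestLayer = layer

print("Best layers: {} (cross-validated score {})".format(bestLayer, maxAccuracy))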
Example 8
    pca_train_result = model_200.transform(train_vectors_withlabel).selectExpr(
        'label_train as label', 'pca_vector as feature')
    pca_test_result = model_200.transform(test_vectors_withlabel).selectExpr(
        'label_test as label', 'pca_vector as feature')

    # define parameters
    input_layer = 200  # number of features
    output_layer = 10  # output 0~9
    hidden_1 = 150
    hidden_2 = 150
    layers = [input_layer, hidden_1, hidden_2, output_layer]

    MPC = MultilayerPerceptronClassifier(featuresCol='feature',
                                         labelCol='label',
                                         predictionCol='prediction',
                                         maxIter=400,
                                         layers=layers,
                                         blockSize=128,
                                         seed=123)

    model = MPC.fit(pca_train_result)

    result = model.transform(pca_test_result).select("label", "prediction")
    result_lp = result.selectExpr("label",
                                  "cast (prediction as int) prediction")
    final_result = result_lp.rdd
    count = final_result.count()

    # calculate the accuracy

    neutral_zero_value = 0
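    # Hypothetical continuation (the excerpt is cut off here): count the rows where the
    # integer prediction equals the label and divide by the total count computed above.
    correct = final_result.filter(lambda row: int(row["label"]) == row["prediction"]).count()
    accuracy = correct / float(count)
    print("Test accuracy = {}".format(accuracy))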
Example 9
# Split the dataset into train and test
splits = dataset.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# Specify the layers of the neural network: layers = [9, 9, 9, 10]
layers = [9, 9, 9, 10]

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

# Create the neural network model, train it and make the prediction
mpc = MultilayerPerceptronClassifier(layers=layers,
                                     labelCol='attack_cat_index',
                                     featuresCol='features',
                                     seed=1234,
                                     predictionCol='prediction')
mpc = mpc.fit(train)

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)
result = mpc.transform(test)
dataset.show(25)
result.show(25)

# Evaluate the prediction
evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(result)
print("Accuracy = {}".format(accuracy))
Example 10
print("Finished randomSplit")

processing = datetime.now()
processing_time = (processing - start).seconds
print("Processing time = {}".format(processing_time))

featured_data.rdd.saveAsTextFile(sys.argv[8])

classifiers = [
    LogisticRegression(labelCol='EXPIRE_FLAG'),
    LinearSVC(labelCol='EXPIRE_FLAG'),
    DecisionTreeClassifier(labelCol='EXPIRE_FLAG'),
    RandomForestClassifier(labelCol='EXPIRE_FLAG'),
    GBTClassifier(labelCol='EXPIRE_FLAG'),
    MultilayerPerceptronClassifier(labelCol='EXPIRE_FLAG',
                                   layers=[34, 20, 20, 2]),
    NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol='EXPIRE_FLAG')
]

for classifier in classifiers:
    model = classifier.fit(train)
    predictions = model.transform(test)

    evaluator = MulticlassClassificationEvaluator(labelCol="EXPIRE_FLAG",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    accuracy

    print("Model is: ", model)
    print("Accuracy is: ", accuracy)
Example 11
for i in range(len(label)):
    labelDict[label[i]] = i
labelValIndex = list(labelDict.items())
labelRdd = sc.parallelize(labelValIndex)
labelDF = spark.createDataFrame(labelRdd, ['secID', 'index'])
labelDF.write.save('hdfs://master:9000//test/labelIndexer_{}'.format(index),
                   format='parquet',
                   mode='append')

# df = spark.read.format('parquet').load('hdfs://master:9000//sparkExperiment/labelIndexer/labelIndexer_60438')
inputNode = len(columnName) - 1
outputNode = len(label)
layers = [inputNode, 5, 4, outputNode]
trainer = MultilayerPerceptronClassifier(featuresCol="features",
                                         labelCol="label",
                                         maxIter=100,
                                         layers=layers,
                                         blockSize=128,
                                         seed=1234)
trainData = trainData.select("features", "indexedLabel").selectExpr(
    "features as features", "indexedLabel as label")
model = trainer.fit(trainData)
test = sc.textFile(
    'hdfs://master:9000//fcd/split/test/397-290_testDataSplit/testData_{}.csv'.
    format(index))
test = test.map(lambda line: line.split(','))
columnName = test.take(1)[0]
test = test.filter(lambda row: row != columnName).toDF(columnName)
test = test.rdd.map(lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(
    ["features", "label"])
model.save('hdfs://master:9000//test/model_{}'.format(index))
pred = model.transform(test)
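
# Hypothetical follow-up (not in the original excerpt): evaluate the predictions,
# assuming the "label" column in `test` holds the same numeric indices used in training.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy = {}".format(evaluator.evaluate(pred)))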
Example 12
df_train = spark_functions.prepare_features(df_train)

assembler = VectorAssembler(
    inputCols=['latitude', 'longitude', 'gps_height', 'construction_year'],
    outputCol="features")

scaler = StandardScaler(inputCol='features',
                        outputCol='features_scaled',
                        withStd=True,
                        withMean=False)

labelIndexer = StringIndexer(inputCol="status_group",
                             outputCol="label").fit(df_train)

mlp = MultilayerPerceptronClassifier(seed=42)
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="status_group_prediction",
                               labels=labelIndexer.labels)

param_grid = ParamGridBuilder()\
    .addGrid(assembler.outputCol, ['features'])\
    .addGrid(mlp.maxIter, [100])\
    .addGrid(mlp.layers, [[4, 10, 3]])\
    .addGrid(mlp.blockSize, [1])\
    .build()

pipeline = Pipeline(
    # make a new column with a vector of features
    v_assembler = VectorAssembler(inputCols=features_list, outputCol='features')

    return v_assembler.transform(data)

if __name__ == "__main__":

    # create SparkSession - the entry to the cluster
    spark = SparkSession.builder.master("spark://192.168.50.10:7077").appName("MLP - MNIST").getOrCreate()

    train = prepare_mnist_data("mnist_train.csv")
    test = prepare_mnist_data("mnist_test.csv")


    mlp = MultilayerPerceptronClassifier(layers=[28*28, 50, 10])

    model = mlp.fit(train)

    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    prediction_and_labels = model.transform(train).select("prediction", "label")
    print("Accuracy train: " + str(evaluator.evaluate(prediction_and_labels)))

    prediction_and_labels = model.transform(test).select("prediction", "label")
    print("Accuracy test: " + str(evaluator.evaluate(prediction_and_labels)))




Example 14
nb = NaiveBayes(modelType="multinomial")
nb_model = nb.fit(train_df)
nb_predictions_df = nb_model.transform(test_df)
nb_predictions_df.take(1)

nb_evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                 predictionCol="prediction",
                                                 metricName="accuracy")
nb_accuracy = nb_evaluator.evaluate(nb_predictions_df)
print(nb_accuracy)

# Multi layer perceptron
from pyspark.ml.classification import MultilayerPerceptronClassifier
layers = [4, 5, 5, 3]  # 4-layer MLP -> 2 hidden layers (excluding input and output)

mlp = MultilayerPerceptronClassifier(layers=layers, seed=1)
mlp_model = mlp.fit(train_df)
mlp_predictions = mlp_model.transform(test_df)
mlp_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_accuracy = mlp_evaluator.evaluate(mlp_predictions)
print(mlp_accuracy)

# Decision trees
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = MulticlassClassificationEvaluator(metricName="accuracy",
                                                 labelCol="label",
                                                 predictionCol="prediction")
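# Hypothetical completion (not in the original excerpt), mirroring the NB and MLP
# evaluation blocks above:
dt_accuracy = dt_evaluator.evaluate(dt_predictions)
print(dt_accuracy)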
## naive bayes classifier
## logistic regression classifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier

nb = NaiveBayes(featuresCol="featuresCol", labelCol="label")
mlor = LogisticRegression(featuresCol='indexed_features', labelCol='label')
dt = DecisionTreeClassifier(featuresCol="indexed_features", labelCol="label")
rf = RandomForestClassifier(featuresCol="indexed_features", labelCol="label")
gbt = GBTClassifier(featuresCol="indexed_features", labelCol="label")
mpnn = MultilayerPerceptronClassifier(featuresCol="indexed_features", labelCol="label")


nb.fit(training).transform(training).select(['prediction']).distinct().show()
dt.fit(training).transform(training).select(['prediction']).distinct().show()
evaluator.evaluate(nb.fit(training).transform(training))


# build parameter grid
from pyspark.ml.tuning import ParamGridBuilder
# param grid for naive bayes
nb_param_grid = ParamGridBuilder().\
    addGrid(nb.smoothing, [0, 0.5, 1, 2, 5, 10]).\
    build()
# param grid for logistic regression
mlor_param_grid = ParamGridBuilder().\
    # load the data
    test_df = spark.read.csv(input_path + "Test-label-28x28.csv", \
              header=False, inferSchema="true").withColumnRenamed("_c0", "label")
    train_df = spark.read.csv(input_path + "Train-label-28x28.csv", \
               header=False, inferSchema="true").withColumnRenamed("_c0", "label")

    ##################### Preprocessing #####################
    # assembler
    feature_list = test_df.columns[1:]
    assembler = VectorAssembler(inputCols=feature_list, outputCol="features")

    ##################### Multilayer Perceptron #####################
    # Train a MultilayerPerceptron model.
    layers = [784, size, 10]
    perceptron = MultilayerPerceptronClassifier(maxIter=100, layers=layers, \
                 blockSize=30, seed=1234)

    ##################### Pipelined Model #####################
    pipeline_per = Pipeline(stages=[assembler, perceptron])

    # train the model
    model_per = pipeline_per.fit(train_df)

    ##################### Prediction #####################
    # make predictions
    result_per = model_per.transform(test_df)

    ##################### Evaluation #####################
    # compute accuracy
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
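                                                  # The excerpt is cut off here; the metric
                                                  # and the lines below are an assumed completion.
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(result_per)
    print("MLP test accuracy = %g" % accuracy)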
def prediction(titanic):

    performance = pd.DataFrame({'Name': ['Logistic Regression', "Logistic Regression - Cross Validation", "Random forest", "Random forest - Cross Validation", "Gradient-Boosted Tree Classifier", "Gradient-Boosted Tree Classifier - Cross Validation", "Decision Tree Classifier", "Decision Tree Classifier - Cross Validation", "Multilayer perceptron classifier", "Multilayer perceptron classifier - Cross Validation", "Naive Bayes"], 
                                'Test_SET (Area Under ROC)': [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], 
                                'Accuracy': [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                                'Best_Param': ["", "", "", "", "", "", "", "", "", "", ""]})

    # ================================DATA PREPROCESSING==================================================

    features = ['female', 'male', 'Q', 'C', 'S', 'low', 'mid', 'Very_low', 'very_high', 'high', 'Pclass', 'Age']
    titanic = titanic.select(F.col("Survived").alias("label"), *features)

    # Standardize features
    vectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")
    standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
    stages = [vectorAssembler, standardScaler]
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(titanic)
    titanic = model.transform(titanic)

    # Randomly split data into training and test sets. Set seed for reproducibility
    (X_train, X_test) = titanic.randomSplit([0.7, 0.3], seed=1)

    # ================================MACHINE LEARNING ALGORITHMS=========================================

    results = []
    names = []

    # Logistic Regression
    name = "Logistic Regression"
    lr = LogisticRegression(labelCol="label")
    predictions_lr, model_lr, performance = run_ML(lr, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_lr,  name, performance)
    results.append(pre_plot(predictions_lr.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Logistic Regression - Cross Validation"
    predictions_lr_cv, model_lr_cv, performance = run_ML_regression_crossValidation(lr, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_lr_cv, name, performance)
    results.append(pre_plot(predictions_lr_cv.select("probability").toPandas()['probability']))
    names.append(name)
    # ROC_Curve(model_lr)
    
    # Random forest
    name = "Random forest"
    rf = RandomForestClassifier(labelCol="label", featuresCol="features")
    predictions_rf, model_rf, performance = run_ML(rf, X_train, X_test, performance, name)
    performance= binaryClassificationEvaluator(predictions_rf, name, performance)
    performance = multiClassClassificationEvaluator(predictions_rf, name, performance)
    results.append(pre_plot(predictions_rf.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Random forest - Cross Validation"
    predictions_rf_cv, model_rf_cv, performance = run_ML_random_crossValidation(rf, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_rf_cv, name, performance)
    performance = multiClassClassificationEvaluator(predictions_rf_cv, name, performance)
    results.append(pre_plot(predictions_rf_cv.select("probability").toPandas()['probability']))
    names.append(name)

    # Gradient-Boosted Tree Classifier
    name = "Gradient-Boosted Tree Classifier"
    gbt = GBTClassifier(labelCol="label", featuresCol="features")
    predictions_gbt, model_gbt, performance = run_ML(gbt, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_gbt, name, performance)
    performance = multiClassClassificationEvaluator(predictions_gbt, name, performance)
    results.append(pre_plot(predictions_gbt.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Gradient-Boosted Tree Classifier - Cross Validation"
    predictions_gbt_cv, model_gbt_cv, performance = run_ML_gbt_crossValidation(gbt, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_gbt_cv, name, performance)
    performance = multiClassClassificationEvaluator(predictions_gbt_cv, name, performance)
    results.append(pre_plot(predictions_gbt_cv.select("probability").toPandas()['probability']))
    names.append(name)

    # DecisionTree model
    name = "Decision Tree Classifier"
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    predictions_dt, model_dt, performance = run_ML(dt, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_dt, name, performance)
    results.append(pre_plot(predictions_dt.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Decision Tree Classifier - Cross Validation"
    predictions_dt_cv, model_dt_cv, performance = run_ML_dt_crossValidation(dt, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_dt_cv, name, performance)
    results.append(pre_plot(predictions_dt_cv.select("probability").toPandas()['probability']))
    names.append(name)

    # Multilayer perceptron classifier
    name = "Multilayer perceptron classifier"
    layers = [len(features), 5, 4, 3]
    mpc = MultilayerPerceptronClassifier(labelCol="label", featuresCol="features", maxIter=100, layers=layers, blockSize=128)
    predictions_mpc, model_mpc, performance = run_ML(mpc, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_mpc,  name, performance)
    results.append(pre_plot(predictions_mpc.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Multilayer perceptron classifier - Cross Validation"
    predictions_mpc_cv, model_mpc_cv, performance = run_ML_mpc_crossValidation(mpc, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_mpc_cv, name, performance)
    results.append(pre_plot(predictions_mpc_cv.select("probability").toPandas()['probability']))
    names.append(name)

    # Linear Support Vector Machine
    # lsvc = LinearSVC(maxIter=10, regParam=0.1)
    # run_ML(lsvc, X_train, X_test)
    # predictions_lsvc, model_lsvc  = run_ML(lsvc, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_lsvc,  "Linear Support Vector Machine")
    # results.append(pre_plot(predictions_lsvc.select("probability").toPandas()['probability']))
    # names.append("Linear Support Vector Machine")

    # Naive Bayes
    name = "Naive Bayes"
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="label", featuresCol="features")
    predictions_nb, model_nb, performance = run_ML(nb, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_nb,  name, performance)
    results.append(pre_plot(predictions_nb.select("probability").toPandas()['probability']))
    names.append(name)
    
    """ Regression obviously doesn't work here, it's a classification problem
    # Linear regression
    # linr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    # run_ML(linr, X_train, X_test)
    # predictions_linr, model_linr  = run_ML(linr, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_linr,  "Linear regression")
    # results.append(pre_plot(predictions_linr.select("probability").toPandas()['probability']))
    
    # Generalized linear regression
    # glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)
    # run_ML(glr, X_train, X_test)
    # predictions_glr, model_glr  = run_ML(glr, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_glr,  "Generalized linear regression")
    # results.append(pre_plot(predictions_glr.select("probability").toPandas()['probability']))
    
    # Decision tree regression
    dtr = DecisionTreeRegressor(featuresCol="features")
    run_ML(dtr, X_train, X_test)
    predictions_dtr, model_dtr  = run_ML(dtr, X_train, X_test)
    regressionEvaluator(predictions_dtr,  "Decision tree regression")
    predictions_dtr.show()
    # results.append(pre_plot(predictions_dtr.select("probability").toPandas()['probability']))
    
    # Random forest regression
    rfr = RandomForestRegressor(featuresCol="features")
    run_ML(rfr, X_train, X_test)
    predictions_rfr, model_rfr  = run_ML(rfr, X_train, X_test)
    regressionEvaluator(predictions_rfr,  "Random forest regression")
    predictions_rfr.show()
    # results.append(pre_plot(predictions_rfr.select("probability").toPandas()['probability']))
    
    # Gradient-boosted tree regression
    gbtr = GBTRegressor(featuresCol="features", maxIter=10)
    run_ML(gbtr, X_train, X_test)
    predictions_gbtr, model_gbt  = run_ML(gbtr, X_train, X_test)
    regressionEvaluator(predictions_gbtr,  "Gradient-boosted tree regression")
    predictions_gbtr.show()
    # results.append(pre_plot(predictions_gbtr.select("probability").toPandas()['probability']))
    
    # Survival regression
    # quantileProbabilities = [0.3, 0.6]
    # aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities, quantilesCol="quantiles")
    # run_ML(aft, "Survival regression", X_train, X_test)
    # predictions_aft, model_aft  = run_ML(aft, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_aft,  "Survival regression")
    # results.append(pre_plot(predictions_aft.select("probability").toPandas()['probability']))
    
    # Isotonic regression
    # it = IsotonicRegression()
    # run_ML(it, "Isotonic regression", X_train, X_test)
    # predictions_it, model_it  = run_ML(it, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_it,  "Isotonic regression")
    # results.append(pre_plot(predictions_it.select("probability").toPandas()['probability']))
    """
    
    #================================BOXPLOT ALGORITHM COMPARISON========================================
    
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()

    final_result = spark.createDataFrame(performance)

    return final_result
Example 18
train_sents1_lower = train_sents1.withColumn('lower_sents', udf_lower('sentence1') )
# train_sents1_lower.show(5)

udf_rv_punc = F.udf(remove_punctuation_re, StringType() )
train_sents1_rv_punc = train_sents1_lower.withColumn('rv_punc_sents', udf_rv_punc('lower_sents') )

tokenizer = Tokenizer(inputCol="rv_punc_sents", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="filtered_tokens", outputCol="avg_word_embed")

doc2vec_pipeline = Pipeline(stages=[tokenizer,remover,w2v])
doc2vec_model = doc2vec_pipeline.fit(train_sents1_rv_punc)
doc2vecs_df = doc2vec_model.transform(train_sents1_rv_punc)
w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

genre2label = StringIndexer(inputCol="genre", outputCol="label")
rf_classifier = MultilayerPerceptronClassifier(labelCol="label", featuresCol="avg_word_embed")

rf_classifier_pipeline = Pipeline(stages=[genre2label,rf_classifier])
rf_predictions = rf_classifier_pipeline.fit(w2v_train_df).transform(w2v_test_df)

rf_model_evaluator = MulticlassClassificationEvaluator( \
    labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracy = rf_model_evaluator.evaluate(rf_predictions)
print("Accuracy = %g" % (accuracy))
Example 19
def main():
    #Loading the data
    player_data = spark.read.format("mongo").options(
        collection='players2').load()

    #Conversion to integers - Mongo gets strings
    player_data = player_data.withColumn(
        'age', player_data['age'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'weight_kg', player_data['weight_kg'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'overall', player_data['overall'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'pace', player_data['pace'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'passing', player_data['passing'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'physic', player_data['physic'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'movement_agility',
        player_data['movement_agility'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'power_stamina',
        player_data['power_stamina'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'mentality_aggression',
        player_data['mentality_aggression'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'shooting', player_data['shooting'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'dribbling', player_data['dribbling'].cast(types.IntegerType()))
    player_data = player_data.withColumn(
        'defending', player_data['defending'].cast(types.IntegerType()))

    #Feature Engineering
    players_data1 = player_data.select(
        'age', 'weight_kg', 'nationality', 'club', 'overall', 'potential',
        'value_eur', 'wage_eur', 'movement_agility', 'power_stamina',
        'mentality_aggression', 'pace', 'physic', 'passing', 'shooting',
        'defending', 'dribbling')
    players_data1 = players_data1.dropna()
    players_data2 = players_data1.drop('club', 'wage_eur')
    players_data2 = players_data2.withColumn('value_range', \
                         functions.when((functions.col('value_eur').between(10000, 200000)), 1) \
                         .when((functions.col('value_eur').between(200000, 400000)), 2) \
                         .when((functions.col('value_eur').between(400000, 600000)), 3) \
                         .when((functions.col('value_eur').between(600000, 800000)), 4) \
                         .when((functions.col('value_eur').between(800000, 1000000)), 5) \
                         .otherwise(0))

    #ML
    train, validation = players_data2.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    feature_vector = VectorAssembler(inputCols=[
        'age', 'weight_kg', 'overall', 'pace', 'passing', 'physic',
        'movement_agility', 'power_stamina', 'mentality_aggression', 'passing',
        'shooting', 'defending', 'dribbling'
    ],
                                     outputCol='features')
    classifier = MultilayerPerceptronClassifier(layers=[13, 130, 6],
                                                featuresCol='features',
                                                labelCol='value_range',
                                                maxIter=500)
    ml_pipeline = Pipeline(stages=[feature_vector, classifier])
    model = ml_pipeline.fit(train)
    model.write().overwrite().save('wage_modeller')

    prediction = model.transform(validation)
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='value_range',
                                                  metricName='f1')
    score = evaluator.evaluate(prediction)
    print('Validation score for new player wages: %g' % (score, ))
"""-------------------------------------------------------------------------------------------------"""
"""MULTILAYER PERCEPTRON MLP CLASSIFIER """

from pyspark.ml.classification import MultilayerPerceptronClassifier
#We will use cross validation to find the optimal hyperparameters
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
#mlpParamGrid = ParamGridBuilder()\
#        .addGrid(MultilayerPerceptronClassifier.maxIter,[100, 200, 500])\
#        .addGrid(MultilayerPerceptronClassifier.blockSize,[10,20,30])\
#        .addGrid(MultilayerPerceptronClassifier.layers, [[3,6,6,3],[3,20,20,3],[3,100,100,3]])\
#        .build()
layers = [3, 20, 20, 3]
#mlpCrossval = CrossValidator(estimator=MultilayerPerceptronClassifier(layers=layers,labelCol="label",featuresCol="pcaFeatures", solver = "l-bfgs", seed = 1234), estimatorParamMaps = mlpParamGrid, evaluator=MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label",metricName="accuracy"), numFolds = 5)
mlp = MultilayerPerceptronClassifier(blockSize=10,
                                     layers=layers,
                                     labelCol="label",
                                     featuresCol="pcaFeatures",
                                     solver="l-bfgs",
                                     seed=1234)
#create the model
import time
mlp_start = time.time()
mlpModel = mlp.fit(trainingData)
mlp_end = time.time()
print("MLP Classifier")
print()
print()

#Predict on the test data
mlppredictions = mlpModel.transform(testData)
mlppredictions.select("prediction", "variety", "label").collect()
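
# Hypothetical follow-up (not in the original excerpt): report test accuracy and the
# training time measured with mlp_start/mlp_end above.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

mlp_evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
print("MLP accuracy: {}".format(mlp_evaluator.evaluate(mlppredictions)))
print("MLP training time: {:.2f} s".format(mlp_end - mlp_start))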
Example 21
        .appName("Spark ML KMEANS with dataframes") \
        .master("local[4]") \
        .getOrCreate()

    data_frame = spark_session \
        .read \
        .format("libsvm") \
        .load("data/wine.scale.txt")

    data_frame.printSchema()
    data_frame.show()

    (training_data, test_data) = data_frame.randomSplit([0.8, 0.2])

    naiveBayes = NaiveBayes(modelType="gaussian")
    perceptron = MultilayerPerceptronClassifier(seed=123)

    paramGrid_old = ParamGridBuilder() \
        .addGrid(NaiveBayes.smoothing, [0.05, 0.0, 0.1, 0.2, 0.5]) \
        .build()

    paramGrid = ParamGridBuilder() \
        .addGrid(perceptron.maxIter, [10, 30, 100, 500])\
        .addGrid(perceptron.layers, [[13, 7, 5, 3], [13, 8, 4, 5, 3]])\
        .build()

    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    crossval_old = CrossValidator(estimator=naiveBayes,
                                  estimatorParamMaps=paramGrid_old,
                                  evaluator=evaluator,
                                  numFolds=5)
Example 22
    def _train_model_spark(self, data):
        df = self._prepare_data_spark(data)
        input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                                self.TODAY_PRICE}))

        if self.ann_hidden_nodes_num is None:
            self.ann_hidden_nodes_num = input_num // 2 + 1  # integer division: layer sizes must be ints
        ann_layers = [input_num,
                      # input_num / 3 * 2,
                      # input_num / 3,
                      self.ann_hidden_nodes_num,
                      2]

        self.logger.info('layer settings are {}'.format(ann_layers))
        self.logger.info('training method is {}'.format(self._train_method))
        self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
        if isinstance(self._train_method, dict):
            if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                self._model[self.CHANGE_AMOUNT].stop_server()
            self._model = {self.CHANGE_AMOUNT: None,
                           self.CHANGE_DIRECTION: None}

            if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                      maxIter=self.linear_regression_training_times,
                                      regParam=self.linear_regression_regularization_parameter,
                                      predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = lr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth,
                                            predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                          num_workers=self.spark_worker_numbers,
                                                                          epoch=self.ann_epoch_number,
                                                                          featuresCol="features",
                                                                          labelCol=self.CHANGE_AMOUNT,
                                                                          predictionCol='AmountPrediction'
                                                                          )
                self._model[self.CHANGE_AMOUNT].fit(df)
            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

            if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
                lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                        maxIter=self.logistic_regression_training_times,
                                        regParam=self.linear_regression_regularization_parameter,
                                        predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = lr.fit(df)
            elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
                rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                             numTrees=self.random_forest_tree_number,
                                             maxDepth=self.random_forest_tree_max_depth,
                                             predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = rfc.fit(df)

            elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 2
                mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                      labelCol=self.CHANGE_DIRECTION,
                                                      layers=ann_layers,
                                                      predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        else:
            if self._train_method == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                      regParam=self.linear_regression_regularization_parameter,
                                      maxIter=self.linear_regression_training_times)
                self._model = lr.fit(df)
            elif self._train_method == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                            predictionCol='prediction',
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth)
                self._model = rfr.fit(df)

            elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                if self._model is not None:
                    self._model.stop_server()
                self.logger.warn('layers are {}'.format(ann_layers))
                self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                      num_workers=self.spark_worker_numbers, epoch=100,
                                                      featuresCol="features", labelCol=self.TARGET_PRICE,
                                                      predictionCol='prediction'
                                                      )
                self._model.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        return self._model
Example 23
    # Create PCA model (reduce to 6 principal components)
    pca = PCA(k=6, inputCol="baseFeatures", outputCol="features")

    timerstart = timeit.default_timer()

    # Reduce assembled data
    model = pca.fit(assembledData)
    reducedData = model.transform(assembledData).select("features", "label")

    # Specify layers for the neural network:
    # Input layer of size 6 (PCA features), a hidden layer of size 4 and output of size 2 (classes)
    layers = [6, 4, 2]

    # Create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=50, layers=layers)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = reducedData.randomSplit([0.7, 0.3])  # Using PCA
    #(trainingData, testData) = assembledData.randomSplit([0.7, 0.3]) # Not using PCA

    # Train model
    model = trainer.fit(trainingData)

    timerend = timeit.default_timer()

    # Make predictions
    predictions = model.transform(testData)

    # Select (prediction, true label) and compute metrics
    f1 = MulticlassClassificationEvaluator(
# List of dataframes for each of the experts
dataframes = train_data.randomSplit([1.0 for x in range(num_of_experts)],
                                    seed=1234)

# Get the models for each expert using the parameters of the best model defined above
print("Generating and training experts...")
start = time.time()
for expert in range(num_of_experts):

    train_data_experts, test_data_experts = dataframes[expert].randomSplit(
        [0.8, 0.2])

    trainer = MultilayerPerceptronClassifier(maxIter=iters,
                                             layers=layers,
                                             stepSize=lr,
                                             blockSize=128,
                                             seed=1234)
    model = trainer.fit(train_data_experts)
    dict_of_models[expert] = model

# Dictionary to store the predictions of the full dataset for each trained expert
dict_of_predictions = dict()

# Iterate through the expert and predict the values of each dataset
print("Generating predictions...")
for expert in range(num_of_experts):
    dict_of_predictions[expert] = dict_of_models[expert].transform(test_data)

# Create a pandas dataframe whose columns are each predictions of each expert
evaluations = pd.concat([
Example 25
    ###########################################################################
    #########                    Training and Test                    #########

    print("\n======================================================= ")
    print("==================== NEURAL NETWORK =================== ")
    print("=======================================================\n")

    print("\n================== Training ===================\n")

    #training model MLP
    num_cols = rescaledData.select(
        'features').collect()[0].features.size  #vocabulary size
    layers = [num_cols, 100, 2]
    trainer_MLP = MultilayerPerceptronClassifier(maxIter=100,
                                                 layers=layers,
                                                 blockSize=128,
                                                 seed=1234)
    model_MLP = trainer_MLP.fit(rescaledData)
    print("Done : Neural Network Training")

    print("\n========= Test on Brexit labeled data =========\n ")

    #MLP
    result_MLP = model_MLP.transform(rescaled_test_df_brexit)
    predictionAndLabels = result_MLP.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy_MLP = evaluator.evaluate(predictionAndLabels)
    print("Accuracy MLP = " + str(accuracy_MLP))

    file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n")
    file.write('-> ACCURACY MLP : ' + str(accuracy_MLP) + '\n')
Example 26
# Split the dataset into train and test
splits = dataset.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# Specify the layers of the neural network (Zeek layers = [9, 9, 9, 10])
# (Argus layers = [15,21,12,10,13,10] or layers = [16,21,12,10,13,10])
layers = [9, 9, 9, 10]


# Create the neural network model, train it, save it and make the prediction
now = datetime.datetime.now()
print (now.year, now.month, now.day, now.hour, now.minute, now.second)

mpc = MultilayerPerceptronClassifier(layers=layers, labelCol='attack_cat_index', featuresCol='features', seed=1234,
                                     predictionCol='prediction')
mpc = mpc.fit(train)
model_output_path = "{}/data/NeuralNetwork.bin".format( base_path)
mpc.write().overwrite().save(model_output_path)

now = datetime.datetime.now()
print (now.year, now.month, now.day, now.hour, now.minute, now.second)

result = mpc.transform(test)

# Create a function for the TPR (true positive rate)
prediction_list = result.select("attack_cat_index", "prediction").toPandas()[["attack_cat_index","prediction"]].values.tolist()
def truePositiveRate(list, label):
    tot_count = 0
    true_count = 0
    for a in list:
Example 27
# COMPARE TO LOGISTIC REGRESSION
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.5, elasticNetParam=0.8, \
       labelCol="indexed", featuresCol="pcaFeatures")
lrModel = lr.fit(trainingData)
#Predict on the test data
lrPredictions = lrModel.transform(testData)
lrPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator.evaluate(lrPredictions)

# COMPARE TO NEURAL NETWORK MULTILAYER PERCEPTRON
from pyspark.ml.classification import MultilayerPerceptronClassifier
layers = [3, 25, 25, 2]
# layers = [input_dim, hidden layers, output_dim (number of classes)]
nn = MultilayerPerceptronClassifier(maxIter=100, \
        layers=layers, \
    blockSize=128, seed=124, labelCol="indexed", \
    featuresCol="pcaFeatures")
nnModel = nn.fit(trainingData)
#Predict on the test data
nnPredictions = nnModel.transform(testData)
nnPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator.evaluate(nnPredictions)
"""--------------------------------------------------
Modify the code above to:
    - train a logistic regression with the original vars (5% significant p-value)
    - from the selected vars above, train 2 logistic models with regParam = [0.01 and 0.5]
    - train 2 random forest (number of trees = 10 and 100)
    - compare results
"""

#Create the model
Example 28
output_path = args.output

train_datafile = 'hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Train-label-28x28.csv'
test_datafile = 'hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Test-label-28x28.csv'

train_data = spark.read.csv(train_datafile, header=False, inferSchema="true")
test_data = spark.read.csv(test_datafile, header=False,
                           inferSchema="true").repartition(16)

#Assembler
assembler = VectorAssembler(inputCols=train_data.columns[1:],
                            outputCol="features")

#MLP_trainer
layers = np.array(args.hiddenLayerSize.split(','), dtype=int)
trainer = MultilayerPerceptronClassifier(labelCol="_c0",featuresCol='features', \
                                         maxIter=100, layers=layers, blockSize=128,seed=1234)
#pipeline
pipeline = Pipeline(stages=[assembler, trainer])
pipelineFit = pipeline.fit(train_data)

prediction = pipelineFit.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="_c0",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(prediction)

print("Predictions accuracy = %g, Test Error = %g" % (accuracy,
                                                      (1.0 - accuracy)))
    data = sqlContext.read.format("libsvm")\
        .load("data/mllib/sample_multiclass_classification_data.txt")
    # Split the data into train and test
    
    data.show() 
    data.printSchema()
    data.select('features').show()
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    print (train.count())
    train.show()
    test = splits[1]
    
    
    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    sc.stop()
Example 30
#Creating the combined DataFrame for Phase 3
def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)


finalDF = unionAll(tpDF, fp1DF, fp2DF)
finalDF.count()

#Training and Evaluation of Phase 3

(TData, TstData) = finalDF.randomSplit([0.7, 0.3])

layers = [28, 29, 30, 2]
trainer1 = MultilayerPerceptronClassifier(maxIter=100,
                                          layers=layers,
                                          blockSize=128,
                                          seed=1234)

pipeline4 = Pipeline(stages=[trainer1])
model4 = pipeline4.fit(TData)

predict4 = model4.transform(TstData)

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predict4)
print("Accuracy = %g" % (accuracy))

predictionAndLabels = predict4.select("prediction", "label")
predictionAndLabels.rdd.take(2)
# Subtracting 'train' from original 'customer_complaints_DF' to get test set
test = customer_complaints_DF.subtract(train)
# Checking distributions of all labels in train and test sets after sampling
train.groupBy("label").count().show()
test.groupBy("label").count().show()

train = train.cache()

# specify layers for the neural network:
# input layer of size lexicon_size (features), one intermediate of size (lexicon_size+13)//2
# and output of size 13 (classes)
layers = [lexicon_size, (lexicon_size + 13) // 2, 13]

# Define the model
trainer = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         blockSize=128,
                                         seed=1234)

# Train on the train set and evaluate on the test set
# Fit the model
start_time = time.time()

model = trainer.fit(train)

time_cache = time.time() - start_time

# compute accuracy on the test set
# Transform the test set with the model to get a new column in the test dataframe that contains the predictions
result = model.transform(test)

# Keep the true labels and the predictions
Example 32
    test_set = df.subtract(train_set)

    # Get number of documents for each set
    print('\n\nSize of train set: ', train_set.count(), '\n\n')
    print('\n\nSize of test set: ', test_set.count(), '\n\n')

    # Samples per Category for each set
    train_set.groupBy('category').count().show()
    test_set.groupBy('category').count().show()

    # input layer:k size, output layer:unique_cat size
    layers = [k, 200, len(uniq)]

    # Trainer
    trainer = MultilayerPerceptronClassifier(maxIter=100,
                                             layers=layers,
                                             blockSize=64,
                                             seed=seed)

    start_time = time.time()

    # Train the model
    model = trainer.fit(train_set)
    print('\n\n--- Time Elapsed for Training: {:0.2f} seconds ---\n\n'.format(
        time.time() - start_time))

    # compute accuracy on the test set
    result = model.transform(test_set)
    predictionAndLabels = result.select('prediction', 'label')
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    print('\nTest set accuracy = {:0.2f} %\n'.format(
        evaluator.evaluate(predictionAndLabels) * 100))
Example 33
0
testData = scalerModel.transform(testData)

#pdb.set_trace()
#model init
lr = LogisticRegression(featuresCol='scaledFeatures',
                        maxIter=100,
                        regParam=0.3,
                        elasticNetParam=0.8,
                        tol=0.0001,
                        family="binomial")
dt = DecisionTreeClassifier(featuresCol='scaledFeatures', seed=seed)
rf = RandomForestClassifier(featuresCol='scaledFeatures', seed=seed)
GBDT = GBTClassifier(featuresCol='scaledFeatures', seed=seed)
layers = [feature_number, 10, 5, 2]
mlp = MultilayerPerceptronClassifier(featuresCol='scaledFeatures',
                                     layers=layers,
                                     seed=seed)
svm = LinearSVC(featuresCol='scaledFeatures', regParam=0.1)
nb = NaiveBayes(featuresCol='scaledFeatures', smoothing=1.0)


#model training and testing functions
def LR(trainingData, testData):

    Model = lr.fit(trainingData)
    results = Model.transform(testData)

    label = results.select("label").toPandas().values
    predict = results.select("prediction").toPandas().values
    np.savetxt('res/predictedLR_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
Example 34
0
def main(argv):

    # Name of prediction column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
                        .master("local[*]") \
                        .appName("datasetClassifier") \
                        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)

    print(f"Feature count    : {featureCount}")
    classCount = int(data.select(label).distinct().count())
    print(f"Class count    : {classCount}")
    print(f"Dataset size (unbalanced)    : {data.count()}")
    data.groupby(label).count().show(classCount)

    data = datasetBalancer.downsample(data, label, 1)

    print(f"Dataset size (balanced)  : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # DecisionTree
    dtc = DecisionTreeClassifier()
    mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # RandomForest
    rfc = RandomForestClassifier()
    mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # LogisticRegression
    lr = LogisticRegression()
    mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # MultilayerPerceptronClassifier
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
                                          .setBlockSize(128) \
                                          .setSeed(1234) \
                                          .setMaxIter(200)
    mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f  sec." % (end - start))
Example 35
0
                    predictionsPath)
            spark.stop()

            data["predictionsPath"] = predictionsPath

        elif config["estimatorType"] == "mpc":
            train, test = spark.read.parquet(
                data["currentTrain"]), spark.read.parquet(data["currentTest"])

            train.cache()
            test.cache()

            classifier = MultilayerPerceptronClassifier(
                featuresCol=config["featuresCol"],
                labelCol=config["labelCol"],
                maxIter=config["maxIter"],
                layers=[int(x.strip()) for x in config["layers"].split(",")],
                blockSize=config["blockSize"],
                seed=config["seed"])

            # Fit the model
            model = classifier.fit(train)

            predictions = model.transform(test)

            predictionsPath = data['scheme'] + "://" + data[
                'save'] + "/predictions/"

            if "partitionCol" in data and data[
                    'partitionCol'] in predictions.schema.names:
                test.write.partitionBy(data['partitionCol']).format(
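The write call above is truncated mid-statement; a hedged sketch of how such a partitioned write is commonly completed (the parquet format and overwrite mode here are assumptions, not taken from the original):

test.write.partitionBy(data['partitionCol']).format("parquet") \
    .mode("overwrite").save(predictionsPath)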
Example 36
0
def mpc_core(df, condition):
    """
    mpc多分类核心函数
    :param df:
    :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20,"regParam":0.0,"elasticNetParam":0.0,"tol":0.000006,"fitIntercept":True}
    :return:
    """
    {
        "label": "标签",
        "features": ["数量", "折扣", "利润", "装运成本"],
        "iterations": 20,
        "seed": 1,
        "layers": [4, 2, 2],
        "stepSize": 0.03,
        "tol": 0.000001,
        "blockSize": 128,
        "solver": "l-bfgs"
    }

    # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs"
    label_index = condition['label']  # label column (name or index)
    feature_indexs = condition['features']  # feature columns (names or indices)
    iterations = condition['iterations']  # maximum number of iterations (default: 100)
    tol = condition['tol']  # convergence tolerance of the iterative algorithm (>= 0, default: 1e-06)
    seed = condition['seed']  # random seed
    layers = condition['layers']  # sizes of the layers, from input layer to output layer
    blockSize = condition['blockSize']  # block size for stacking input data in matrices
    stepSize = condition['stepSize']  # step size (default: 0.03)
    solver = condition['solver']  # optimization algorithm (default: "l-bfgs"; "gd" is also available)

    # Convert parameters passed in as strings to their proper types
    if isinstance(iterations, str):
        iterations = int(iterations)
    if isinstance(tol, str):
        tol = float(tol)
    if isinstance(seed, str):
        seed = int(seed)
    if isinstance(layers, list):
        for i in range(len(layers)):
            if isinstance(layers[i], str):
                layers[i] = int(layers[i])
    if isinstance(blockSize, str):
        blockSize = int(blockSize)
    if isinstance(stepSize, str):
        stepSize = float(stepSize)

    # 1. Prepare the data
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = df.rdd.map(lambda x: func(x)).toDF()

    # 2. Train the model
    mpc_param = MultilayerPerceptronClassifier(maxIter=iterations,
                                               tol=tol,
                                               seed=seed,
                                               layers=layers,
                                               blockSize=blockSize,
                                               stepSize=stepSize,
                                               solver=solver)
    mpc_model = mpc_param.fit(training_set)

    # 3. Save the model
    mpc_model_path = model_url() + '/mpc/' + str(uuid.uuid1())
    deltree(mpc_model_path)  # remove any previously saved model at this path
    mpc_model.write().overwrite().save(mpc_model_path)

    return mpc_model_path
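mpc_core only returns the path where the model was persisted; a minimal sketch of reloading it for scoring, assuming df_to_score is a hypothetical DataFrame that already carries a "features" vector column assembled the same way as inside the function:

from pyspark.ml.classification import MultilayerPerceptronClassificationModel

model_path = mpc_core(df, condition)
loaded_model = MultilayerPerceptronClassificationModel.load(model_path)
loaded_model.transform(df_to_score).select("features", "prediction").show(5)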