Example #1
"""--------------------------------------------------------------------------
Perform Machine Learning
-------------------------------------------------------------------------"""
#Split into training and testing data
(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.show()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
dtClassifier = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
                featuresCol="features")
dtModel = dtClassifier.fit(trainingData)

dtModel.numNodes
dtModel.depth

#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction", "species", "label").show()

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
evaluator.evaluate(predictions)

#Draw a confusion matrix
predictions.groupBy("label", "prediction").count().show()
Example #2
# Vector assembler

df3 = VectorAssembler(inputCols=['Pclass','Gender1','Embarked2','Fare1','Age1'],outputCol='Features').transform(df3)
df3.show(truncate=False)

# data processing complete---

# 6. Model building
training = df3
training1 = df3
training.show(truncate=False,n=5)

from pyspark.ml.classification import DecisionTreeClassifier

dt1 = DecisionTreeClassifier(featuresCol='Features',labelCol='Survived') 
dtmodel1 = dt1.fit(training)
predictions = dtmodel1.transform(training)
predictions.select('Survived','rawPrediction','probability','prediction').show(n=5,truncate=False)

from pyspark.ml.classification import GBTClassifier

gbt1 = GBTClassifier(featuresCol='Features',labelCol='Survived',maxDepth=6,maxIter=20)
gbtmodel1 = gbt1.fit(training)
predictions = gbtmodel1.transform(training)

PredictionsandLabels = predictions.select('prediction','Survived').rdd

from pyspark.mllib.evaluation import MulticlassMetrics

metric1 = MulticlassMetrics(PredictionsandLabels)
metric1.accuracy
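
# A short follow-up sketch (an assumption, not part of the original snippet):
# MulticlassMetrics also exposes the confusion matrix and per-class statistics.
print(metric1.confusionMatrix().toArray())
print(metric1.precision(1.0), metric1.recall(1.0))  # precision/recall for the "survived" class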
                                 hotel_clusterIndxr AS label
                                 , features
                             FROM dataset
                         ''')
dataset = dataset.drop('hotel_clusterIndxr')

# Write the cleansed dataset to an S3 bucket in Parquet format
dataset.write.parquet("s3://expedia-hotel-recommendations-workflow/spark_OutputCleasedDataset.parquet")


# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3])

# Fit Decision Tree Algorithm
dtc = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dtcm = dtc.fit(trainingData)

# Save the trained Decision Tree model to an S3 bucket for future use
dtcm.save('s3://expedia-hotel-recommendations-workflow/dtcm_model')

# Load the pre-trained Decision Tree model to illustrate how a saved model is imported for future use
dtcModel = DecisionTreeClassificationModel.load("s3://expedia-hotel-recommendations-workflow/dtcm_model")

# Make predictions with Decision Tree model on the Test Dataset
dtcPredictions = dtcModel.transform(testData)

# Calculate and print Accuracy score for Decision Tree Algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
dtcAccuracy = evaluator.evaluate(dtcPredictions)
print("Decision Tree accuracy Error = %g" % (dtcAccuracy))
Example #4
vector_assembler = VectorAssembler(inputCols= vector_features_zeek, outputCol="features")
dataset = vector_assembler.transform(dataset)
dataset.show(25)

# Split the dataset into train and test
splits = dataset.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# Create the Decision Tree model, train it, and run the prediction
now = datetime.datetime.now()
print (now.year, now.month, now.day, now.hour, now.minute, now.second)

dt = DecisionTreeClassifier(labelCol='attack_cat_index', featuresCol='features', impurity='entropy', seed=1234, maxBins=136, maxDepth=25)
dt = dt.fit(train)

now = datetime.datetime.now()
print (now.year, now.month, now.day, now.hour, now.minute, now.second)

result = dt.transform(test)

prediction_df = result.select("attack_cat_index", "prediction").toPandas()
prediction_list = prediction_df[["attack_cat_index","prediction"]].values.tolist()

# Evaluate the prediction
evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index", metricName="accuracy")
accuracy = evaluator.evaluate(result)
print("Accuracy = {}".format(accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index", metricName="weightedPrecision")
                              "label").select("label",
                                              col(" enum").alias("enum"),
                                              col(" hours").alias("hours"))
data = data.select(data.label.cast("double"), "education-num",
                   "hours-per-week")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.85, 0.15])

# Create Decision tree model and fit the model with training dataset
dt = DecisionTreeClassifier()
model = dt.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)
# Show model accuracy
print("Accuracy:", accuracy)

# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
metrics = MulticlassMetrics(predictionAndLabels)
print("Confusion Matrix:", metrics.confusionMatrix())
print("Precision:", metrics.precision())
Example #6
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('mytree').getOrCreate()
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier,GBTClassifier,
                                    DecisionTreeClassifier)
data=spark.read.format('libsvm').load('sample_libsvm_data.txt')
# We get a table with a label column and a features column
data.show()
# Split into training and test data
train_data, test_data = data.randomSplit([0.7, 0.3])
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100)  # the more trees we add, the longer the computation time
gbt = GBTClassifier()
# Fit the three models
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)
# Transform the test data to get predictions
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
# We now have the columns: label|features|rawPrediction|probability|prediction.
# prediction holds the predicted label
# For Decision Tree and Random Forest, rawPredictionCol = 'rawPrediction' by default
dtc_preds.show()
# We still have the columns: label|features|rawPrediction|probability|prediction.
rfc_preds.show()
# With GBT we would now expect only the columns label|features|prediction, but I got the same ones as before
gbt_preds.show()
gbt_preds.printSchema()
# Although this is a binary classification data set, if I only use a binary classification evaluator,
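
# A minimal sketch (assumed, not original code): the multiclass evaluator also
# handles this binary data set and reports plain accuracy for all three models.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
for name, preds in [('DTC', dtc_preds), ('RFC', rfc_preds), ('GBT', gbt_preds)]:
    print(name, 'accuracy:', acc_eval.evaluate(preds))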
                      inferSchema=True,
                      header=True)

data.printSchema()

print(data.head())

data.describe().show()

assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],
                            outputCol="features")

output = assembler.transform(data)

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier

rfc = DecisionTreeClassifier(labelCol='Spoiled', featuresCol='features')  # note: despite the name, this is a decision tree

output.printSchema()

final_data = output.select('features', 'Spoiled')
final_data.head()

rfc_model = rfc.fit(final_data)

print(
    "-----------------feature importance --------------------------------------"
)
print(rfc_model.featureImportances)

spark.stop()
Example #8
# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Vectorize the features (all columns excluding the first one, Survived)
features = trainDF.columns[1:]
assembler = VectorAssembler(inputCols=features, outputCol="features")
assembledTrainDF = assembler.transform(trainDF)

# Train a decision tree, setting the maxDepth parameter to 2
dtc = DecisionTreeClassifier(featuresCol="features",
                             labelCol="Survived",
                             maxDepth=2)
dtcModel = dtc.fit(assembledTrainDF)

# Print the constructed tree
print(dtcModel.toDebugString)

# COMMAND ----------

# Visualize the decision tree

display(dtcModel)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Evaluate Feature Importance
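
# COMMAND ----------

# A minimal sketch (assumed, not from the original notebook): pair each input
# column with its importance score from the fitted tree.
list(zip(features, dtcModel.featureImportances.toArray()))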
Example #9
#%% train/test split (not strictly necessary here)
data_train, data_test = data.randomSplit([.7, .3])
print(f'{data_train.count()}, {data_test.count()}')   # 335, 155

#%%
from pyspark.ml.classification import DecisionTreeClassifier, \
                                      RandomForestClassifier, \
                                      GBTClassifier

est_dt = DecisionTreeClassifier(featuresCol='features', labelCol='SpoiledIdx')
est_rf = RandomForestClassifier(featuresCol='features', labelCol='SpoiledIdx')
est_gb = GBTClassifier(featuresCol='features', labelCol='SpoiledIdx')

#%%
model_dt = est_dt.fit(data_train)
model_rf = est_rf.fit(data_train)
model_gb = est_gb.fit(data_train)
                                 
#%%
dir(model_dt)
model_dt.featureImportances
model_rf.featureImportances
model_gb.featureImportances

#%%                                  
vv = model_gb.featureImportances
dir(vv)
type(vv)
vv.values
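
# For reference (a sketch of the usual accessors, assumed): featureImportances
# is a pyspark.ml.linalg.SparseVector.
vv.indices    # indices of the features with non-zero importance
vv.toArray()  # dense numpy array, one entry per input feature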
Example #10
+-----+-----------------------------------------------------------------------------+----------+
|label|features                                                                     |prediction|
+-----+-----------------------------------------------------------------------------+----------+
|0.0  |[1.0,1.0,1.0,31.0,-1.3333333333333333,3382.0,7293.5]                         |0.0       |
|0.0  |[1.0,1.0,1.0,31.0,-1.3333333333333333,9791.833333333334,7911.5]              |0.0       |
|0.0  |[1.0,1.0,1.0,31.0,1.3333333333333333,54426.333333333336,2635.5]              |1.0       |
|1.0  |[1.0,1.0,1.0,34.0,0.0,126839.16666666667,7687.0]                             |0.0       |
|1.0  |[1.0,1.0,1.0,34.0,1.3333333333333333,103421.66666666667,3367.1666666666665]  |1.0       |
|1.0  |[1.0,1.0,1.0,34.0,1.6666666666666667,115795.66666666667,5100.0]              |1.0       |
|1.0  |[1.0,1.0,1.0,35.0,-1.6666666666666667,575.6666666666666,0.0]                 |0.0       |
Random Forest accuracy :  0.807145257028
'''

#Decision Trees model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt_model = dt.fit(training_data)
#Predict on the test data
predictions = dt_model.transform(test_data)
result = predictions.select("label", "features", "prediction")
result.where('label = 1.0').show(20, False)
result.where('label = 0.0').show(20, False)
print('Decision Tree accuracy : ', evaluator.evaluate(predictions))
'''
Decision Tree accuracy :  0.805577332288
'''

#Naive Bayes model
nb = NaiveBayes(labelCol="label", featuresCol="features")
nb_model = nb.fit(training_data)
#Predict on the test data
predictions = nb_model.transform(test_data)
Example #11
df3 = df3.select(df3.Pclass.cast('double'), df3.Gender1, df3.Embarked2,
                 df3.Survived.cast('double'))
df3.printSchema()

# Vector assembler

df3 = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'],
                      outputCol='Features').transform(df3)
df3.show(truncate=False)
#
# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived')

# 2 learning process - created a model
model2 = dt1.fit(df3)
model2.depth
model2.numFeatures

# 3 get predictions

df5 = spark.read.csv('E:/kaggle/titanic/test.csv',
                     header=True).select('PassengerId', 'Sex', 'Pclass',
                                         'Embarked')

df5 = StringIndexer(inputCol='Embarked',
                    outputCol='Embarked1').fit(df5).transform(df5)
df5.show()

df5 = OneHotEncoder(inputCol='Embarked1',
                    outputCol='Embarked2',
Example #12
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")

lr = LogisticRegression(maxIter=10, regParam=0.01)
#print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
model = lr.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("LogisticRegression - Test set accuracy = " + str(accuracy))

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
#print("DecisionTreeClassifier parameters:\n" + dt.explainParams() + "\n")
model = dt.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("DecisionTreeClassifier - Test set accuracy = " + str(accuracy))

rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=10)
#print("RandomForestClassifier parameters:\n" + rf.explainParams() + "\n")
model = rf.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("RandomForestClassifier - Test set accuracy = " + str(accuracy))
Example #13
# MAGIC
# MAGIC You can read more about [Decision Trees](http://spark.apache.org/docs/latest/mllib-decision-tree.html) in the Spark MLlib Programming Guide.
# MAGIC The Decision Trees algorithm is popular because it handles categorical
# MAGIC data and works out of the box with multiclass classification tasks.

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label",
                            featuresCol="features",
                            maxDepth=3)

# Train model with Training Data
dtModel = dt.fit(trainingData)

# COMMAND ----------

# MAGIC %md
# MAGIC We can extract the number of nodes in our decision tree as well as the
# MAGIC tree depth of our model.

# COMMAND ----------

print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)

# COMMAND ----------

display(dtModel)
Example #14
#String Indexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(dfUSD)
td = si_model.transform(dfUSD)
td.collect()
td.show()

#Splitting data
(trainingData, testData) = td.randomSplit([0.6, 0.4])
trainingData.count()
testData.count()
testData.collect()

#Creating decision tree model
dtClassifier = DecisionTreeClassifier(labelCol="indexed", minInstancesPerNode=1500)
dtModel = dtClassifier.fit(trainingData)
dtModel.numNodes
dtModel.depth

#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction","indexed","label","features").show(10)

#Evaluation
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed", metricName="weightedPrecision")  # plain "precision" was removed in Spark 2+
evaluator.evaluate(predictions)

#Draw a confusion matrix
labelList=predictions.select("indexed","label").distinct().toPandas()
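
# A minimal sketch of the usual continuation (assumed): pivot the grouped
# counts into a confusion-matrix layout with pandas.
cm = predictions.groupBy("indexed", "prediction").count().toPandas()
print(cm.pivot(index="indexed", columns="prediction", values="count"))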
Example #15
        .format("libsvm")\
        .load("data/classification.100000.4.txt")

    # 70% for training and 30% for testing
    (training_data, test_data) = data_frame.randomSplit([0.7, 0.3])

    # Calculate the time spent before training
    time_before_training = time.time()
    print("Time before training: " + str(time_before_training - initial_time))

    # Create the decision_tree model
    decision_tree = DecisionTreeClassifier(labelCol="label",
                                           featuresCol="features")

    # Fit the model
    model = decision_tree.fit(training_data)

    # Calculate the time spent training
    training_time = time.time()
    print("Training time: " + str(training_time - time_before_training))

    # Make the prediction
    prediction = model.transform(test_data)

    # Calculate the time spent testing with the fitted model
    testing_time = time.time()
    print("Test time: " + str(testing_time - training_time))

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
Example #16
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean, col, split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(dfd)
    for column in ["1st Layer Clusters"]
]
pipeline = Pipeline(stages=indexers)
ddf = pipeline.fit(dfd).transform(dfd)
ddf = ddf.drop("1st Layer Clusters"
               )  #,"Name","Ticket","Cabin","Embarked","Sex","Initial")
# titanic_df.printSchema()
# from pyspark.sql.types import IntegerType
final = ddf.select([col(c).cast('int') for c in ddf.columns])
# final.printSchema()
feature = VectorAssembler(inputCols=final.columns[1:], outputCol="features")
feature_vector = feature.transform(final)
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="1st Layer Clusters_index",
                            featuresCol="features")
dt_model = dt.fit(trainingData)
dt_prediction = dt_model.transform(testData)
dt_prediction.select("prediction", "1st Layer Clusters_index",
                     "features").show()

# trainingData
#section 8.2.6
# OneVsRest is not available in Python.

#section 8.3.1
from pyspark.ml.feature import StringIndexer
dtsi = StringIndexer(inputCol="label", outputCol="label-ind")
dtsm = dtsi.fit(penlpoints)
pendtlpoints = dtsm.transform(penlpoints).drop("label").withColumnRenamed("label-ind", "label")

pendtsets = pendtlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=20)
dtmodel = dt.fit(pendttrain)

# rootNode is not accessible in Python

dtpredicts = dtmodel.transform(pendtvalid)
dtresrdd = dtpredicts.select("prediction", "label").rdd.map(lambda row: (row.prediction, row.label))

from pyspark.mllib.evaluation import MulticlassMetrics
dtmm = MulticlassMetrics(dtresrdd)
dtmm.precision()
#0.951442968392121
print(dtmm.confusionMatrix())
#DenseMatrix([[ 205.,    0.,    3.,    0.,    0.,    3.,    1.,    0.,    0.,
#                 0.],
#             [   0.,  213.,    0.,    1.,    2.,    1.,    0.,    2.,    0.,
#                 2.],
Example #18
df_test = scaler.transform(df_test)
df_test = df_test.drop('features').withColumnRenamed('scaledFeatures','features')


# train data
algo = DecisionTreeClassifier()
grid = ParamGridBuilder().build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=algo, estimatorParamMaps=grid, numFolds=10, evaluator=evaluator)
cv_model = cv.fit(df_train)
pred = cv_model.transform(df_test)
print("from {}, {} died. {}".format(pred.count(), pred.filter(pred.prediction == 0).count(),pred.filter(pred.prediction == 0).count()/pred.count()))

pred_csv = pred.toPandas()
pred_csv = pred_csv.filter(['PassengerId', 'prediction'])
pred_csv = pred_csv.rename(columns={'prediction':'Survived'})
pred_csv.to_csv(r'lr.csv', index=False, header=True)


data_train, data_test = df_train.randomSplit([0.8,0.2])
# train a decision tree model
algo_t = DecisionTreeClassifier()
model_t = algo_t.fit(data_train)
pat = model_t.transform(data_test)
print("from {}, {} died. {}".format(pat.count(), pat.filter(pat.prediction == 0).count(),pat.filter(pat.prediction == 0).count()/pat.count()))

evaluator = evaluation.MulticlassClassificationEvaluator(metricName='accuracy')
print("Accuracy DecisionTreeClassifier: {}".format(evaluator.evaluate(pat)))


# split into training and test sets
train, test = model_data.randomSplit([0.7, 0.3], seed=2018)

# statistics for the training and test sets
print("Number of rows in the training set:", train.count())
print("Number of rows in the test set:", test.count())
print("Label distribution in the training set:")
train.groupBy("label").count().show()
print("Label distribution in the test set:")
test.groupBy("label").count().show()

# decision tree
tstart = datetime.now()
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dtmodel = dt.fit(train)
predictions = dtmodel.transform(test)
tend = datetime.now()
predictions.select('label', 'prediction', 'probability').show(10)
print("dt time", tend - tstart)

# compute the tp, tn, fp, fn counts
tp = predictions[(predictions.label == 1)
                 & (predictions.prediction == 1)].count()
tn = predictions[(predictions.label == 0)
                 & (predictions.prediction == 0)].count()
fp = predictions[(predictions.label == 0)
                 & (predictions.prediction == 1)].count()
fn = predictions[(predictions.label == 1)
                 & (predictions.prediction == 0)].count()
print("True Positives:", tp)
Example #20
cars_assembled = assembler.transform(cars)

# Check the resulting column
kars = cars_assembled.select('features', 'origin_idx')
#kars.show(9)

# Split data into training and testing sets
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

print(kars_train.count(), kars_test.count())

# Create a Decision Tree classifier
tree = DecisionTreeClassifier(labelCol="origin_idx")

# Learn from training data
tree = tree.fit(kars_train)

# Make predictions on testing data
prediction = tree.transform(kars_test)

prediction.show(9)

# Confusion matrix
confusion_matrix = prediction.groupBy("origin_idx", "prediction").count()
confusion_matrix.show()

# Accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="origin_idx", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))
Example #21
# COMMAND ----------

train_data, test_data = data.randomSplit([0.7, 0.3])

# COMMAND ----------

gbc_class = GBTClassifier()
rfc_class = RandomForestClassifier(numTrees=100)
dtc_class = DecisionTreeClassifier()

# COMMAND ----------

gbc_model = gbc_class.fit(train_data)
rfc_model = rfc_class.fit(train_data)
dtc_model = dtc_class.fit(train_data)

# COMMAND ----------

gbc_pred = gbc_model.transform(test_data)
rfc_pred = rfc_model.transform(test_data)
dtc_pred = dtc_model.transform(test_data)

# COMMAND ----------

#gbc_pred.show()

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
Example #22
# COMMAND ----------

clean_data = cleaner.transform(data)

# COMMAND ----------

clean_data = clean_data.select('label', 'features')

# COMMAND ----------

train, test = clean_data.randomSplit([0.7, 0.3])

# COMMAND ----------

spam_detector = dct.fit(train)

# COMMAND ----------

test_result = spam_detector.transform(test)

# COMMAND ----------

test_result.show()

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# COMMAND ----------
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

# In[62]:

# Train DecisionTree model.

dt = DecisionTreeClassifier(featuresCol='minmax_scaled_features',
                            labelCol='class_attack')

# In[63]:

dtModel = dt.fit(conn_train)

# ## Make predictions on the test set

# In[64]:

# Run prediction on the whole dataset
conn_test_pred = dtModel.transform(conn_test)

# In[65]:

conn_test_pred.limit(3).toPandas()

# In[66]:

# Select example rows to display.
    passengersWithFilledEmptyValues = imputer.fit(passengers).transform(
        passengers)
    passengersWithFilledEmptyValues.show()  # look at the first rows

    #  Step - 4: Transform dataframe to vectorized dataframe
    output = assembler.transform(passengersWithFilledEmptyValues).select(
        "features", "survived"
    )  # keep only the "features" vector and the "survived" label
    output.show()

    # Step - 5: Set up the Decision Tree Classifier
    trainer = DecisionTreeClassifier(labelCol="survived",
                                     featuresCol="features")

    # Step - 6: Train the model
    model = trainer.fit(output)

    # Step - 7: Predict with the model
    rawPredictions = model.transform(output)

    # Step - 8: Evaluate prediction
    evaluator = MulticlassClassificationEvaluator(labelCol="survived",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")

    # Step - 9: Calculate accuracy
    accuracy = evaluator.evaluate(rawPredictions)
    print("Test Error = %g " % (1.0 - accuracy))

    # Step - 10: Print out the model
    print(model.toDebugString)
# Check first five records
flights.show(5)

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    "mon", "dom", "dow", "carrier_idx", "org_idx", "km", "depart", "duration"
],
                            outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights = flights_assembled.select('features', 'xdelay')

# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)

# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier(labelCol="xdelay")
tree_model = tree.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
prediction = tree_model.transform(flights_test)
predictions = prediction.select('xdelay', 'prediction', 'probability')

print(predictions.toPandas().sample(12))

spark.stop()
Example #26
# OneVsRest is not available in Python.

#section 8.3.1
from pyspark.ml.feature import StringIndexer
dtsi = StringIndexer(inputCol="label", outputCol="label-ind")
dtsm = dtsi.fit(penlpoints)
pendtlpoints = dtsm.transform(penlpoints).drop("label").withColumnRenamed(
    "label-ind", "label")

pendtsets = pendtlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=20)
dtmodel = dt.fit(pendttrain)

# rootNode is not accessible in Python

dtpredicts = dtmodel.transform(pendtvalid)
dtresrdd = dtpredicts.select(
    "prediction", "label").rdd.map(lambda row: (row.prediction, row.label))

from pyspark.mllib.evaluation import MulticlassMetrics
dtmm = MulticlassMetrics(dtresrdd)
dtmm.precision()
#0.951442968392121
print(dtmm.confusionMatrix())
#DenseMatrix([[ 205.,    0.,    3.,    0.,    0.,    3.,    1.,    0.,    0.,
#                 0.],
#             [   0.,  213.,    0.,    1.,    2.,    1.,    0.,    2.,    0.,
Example #27
def prepDataForML(df):
    # https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa
    cols = df.columns

    categoricalColumns = ['School']
    stages = []
    for categoricalCol in categoricalColumns:
        # indexes each categorical column using the StringIndexer
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + 'Index')
        #  converts the indexed categories into one-hot encoded variables
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        stages += [stringIndexer, encoder]

    # StringIndexer again to encode our labels to label indices
    label_stringIdx = StringIndexer(inputCol='Drafted', outputCol='label')
    stages += [label_stringIdx]

    # VectorAssembler to combine all the feature columns into a single vector column
    numericCols = [
        'Games Played', 'Att (rushing)', 'Yds (rushing)', 'Avg (rushing)',
        'TD (rushing)', 'Rec (receiving)', 'Yds (receiving)',
        'Avg (receiving)', 'TD (receiving)', 'Plays (scrimmage)',
        'Yds (scrimmage)', 'Avg (scrimmage)', 'TD (scrimmage)', 'Year'
    ]
    assemblerInputs = [c + "classVec"
                       for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features")
    stages += [assembler]

    # PipeLine for the ML data to follow
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(df)
    df = pipelineModel.transform(df)
    selectedCols = ['label', 'features'] + cols
    df = df.select(selectedCols)
    #df.printSchema()
    #df.show()

    # Randomly split data into train and test sets, and set seed for reproducibility.
    train, test = df.randomSplit([0.7, 0.3], seed=2018)
    train.cache()
    test.cache()
    # print("Training Dataset Count: " + str(train.count()))
    # print("Test Dataset Count: " + str(test.count()))

    # Apply machine learning to it

    # Logistic Regression --> ROC: 85%, Accuracy: 88.73239436619719%
    # lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
    # lrModel = lr.fit(train)
    # predictions = lrModel.transform(test)
    # evaluator = BinaryClassificationEvaluator()
    # print('Test Area Under ROC', evaluator.evaluate(predictions) * 100)
    # evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    # accuracy = evaluator.evaluate(predictions)
    # print("Test set accuracy = " + str(accuracy))

    # Random Forest --> ROC: 83.90126725368875%, Accuracy: 90.02347417840375%
    # rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
    # rfModel = rf.fit(train)
    # predictions = rfModel.transform(test)
    # # predictions.show(1000)
    # evaluator = BinaryClassificationEvaluator()
    # print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
    # evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    # accuracy = evaluator.evaluate(predictions)
    # print("Test set accuracy = " + str(accuracy))

    # Linear SVM --> ROC: 77.35899571632558, Accuracy: 89.67136150234741%
    # lsvc = LinearSVC(maxIter=10, regParam=0.1)
    # lsvcModel = lsvc.fit(train)
    # predictions = lsvcModel.transform(test)
    # evaluator = BinaryClassificationEvaluator()
    # print('Test Area Under ROC', evaluator.evaluate(predictions) * 100)
    # evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    # accuracy = evaluator.evaluate(predictions)
    # print("Test set accuracy = " + str(accuracy))

    # Decision Tree --> Area Under ROC: 83.31151832460733, Accuracy: 86.97183098591549%
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    dtModel = dt.fit(train)

    # Cross Validation (see the sketch at the end of this example)

    predictions = dtModel.transform(test)
    evaluator = BinaryClassificationEvaluator()
    print('Test Area Under ROC', evaluator.evaluate(predictions) * 100)
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test set accuracy = " + str(accuracy))
Example #28
# selected features of preprocessed datasets for modeling
allFeatures = ['salutation_Vec','newsletter','model_Vec','paymenttype_Vec','voucher','case','numberitems','gift','entry','shippingcosts','weight','remi','cancel','used','w0','w1','w2','w3','w4','w5','w6','w7','w8','w9','w10','books','nobooks','itemseff']
assembler = VectorAssembler(inputCols=allFeatures, outputCol='features')
dfTrain = assembler.transform(dfTrain)
dfTest = assembler.transform(dfTest)

# Training + Prediction
###################################

revenues = [] # List of generated results

#### Decision Tree Classifier
start_time = time()
tree = DecisionTreeClassifier()
tree_model = tree.fit(dfTrain)
predictionTree = tree_model.transform(dfTest)
end_time = time()
elapsed_time = end_time - start_time
print("Time to train Tree on dfTrain and make predictions on dfTest: %.3f seconds" % elapsed_time)

# Evaluation
predictionTree.groupBy("label", "prediction").count().show()
start_time = time()
#revenue = costMatrix(predictionTree)
tn = predictionTree[(predictionTree.label == 0) & (predictionTree.prediction == 0.0)].count()
fn = predictionTree[(predictionTree.label == 1) & (predictionTree.prediction == 0.0)].count()
revenue = (tn * 1.5 - fn * 5) #revenue based on costMatrix
revenues.append(revenue)
end_time = time()
elapsed_time = end_time - start_time
Example #29
df3.show(truncate=False)

training = df3
training1 = df3

training.show(truncate=False, n=5)

# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features',
                             labelCol='Survived',
                             seed=5000)

# 2 learning process - created a model

model2 = dt1.fit(training)
model2.depth
model2.numFeatures

training1 = model2.transform(training)
training1.show(5)

PredictionandLabels = training1.select(training1.prediction,
                                       training1.Survived).rdd
PredictionandLabels.collect()

from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
# metrics1 = BinaryClassificationMetrics(PredictionandLabels)
# (train score/train accuracy   --- )
# (train error = 1-train score ?)
metrics2 = MulticlassMetrics(PredictionandLabels)
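
# A sketch answering the questions above (assumed, not original output):
print(metrics2.accuracy)        # train score / train accuracy
print(1.0 - metrics2.accuracy)  # train error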
summary = lrModel.summary
print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

# COMMAND ----------

summary.objectiveHistory

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print(dt.explainParams())
dtModel = dt.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)
Example #31

################### DD. Decision Tree ####################

# 10.0 Call library
from pyspark.ml.classification import DecisionTreeClassifier

# 10.1 Instantiate modeling object:

dt = DecisionTreeClassifier(featuresCol='features',
                            labelCol='label',
                            maxDepth=3)

# 10.2 Fit/train model
dtModel = dt.fit(train)

# 10.3 Make predictions and evaluate
predictions = dtModel.transform(test)
predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction', 'probability').show(10)

# 10.4
evaluator = BinaryClassificationEvaluator()  # default metric: areaUnderROC
evaluator.evaluate(predictions)

################ EE. Random Forest Classification ##################

# 11.0 Import library
from pyspark.ml.classification import RandomForestClassifier

# 11.1 Instantiate object and fit:
Example #32
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(predictionCol="Predicted_median",labelCol="label",featuresCol="features",numTrees = 40, maxDepth = 30)
rfModel = rf.fit(Training_set)
rf_predictions = rfModel.transform(Test_set)
rf_predictions.filter(rf_predictions['Predicted_median'] == 0) \
           .select("features","label","Predicted_median","probability") \
           .orderBy("probability", ascending=False) \
           .show(n = 10, truncate = 30)
evaluator = BinaryClassificationEvaluator()
print("Test_SET (Area Under ROC): " + str(evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})))

## DECISION TREES
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(predictionCol='Predicted_median',featuresCol = 'features', labelCol = 'label',maxDepth = 30)
dtModel = dt.fit(Training_set)
dt_predictions = dtModel.transform(Test_set)
dt_predictions.select('features', 'label', 'rawPrediction', 'Predicted_median', 'probability').show(10)
evaluator = BinaryClassificationEvaluator()
print("Test_SET (Area Under ROC): " + str(evaluator.evaluate(dt_predictions, {evaluator.metricName: "areaUnderROC"})))

# LET'S TRY CROSS VALIDATION ON THE GRADIENT BOOSTING MODEL TO SEE IF THE PERFORMANCE IMPROVES.

#GRADIENT BOOSTING WITH CROSS VALIDATION
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# note: the grid and the CrossValidator must reference the same estimator object
paramGrid = (ParamGridBuilder().addGrid(gbt.maxDepth, [2, 4, 10]).addGrid(gbt.maxBins, [10, 20]).addGrid(gbt.maxIter, [10, 25]).build())
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
# Run cross validations.
cvModel = cv.fit(Training_set)
gb_cv_predictions = cvModel.transform(Test_set)
gb_cv_predictions.select('features', 'label', 'rawPrediction', 'Predicted_median', 'probability').show(10)
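
# A possible follow-up (an assumption, the original snippet stops here): score
# the cross-validated model with the same AUC evaluator as above.
print("CV Test_SET (Area Under ROC): " + str(evaluator.evaluate(gb_cv_predictions, {evaluator.metricName: "areaUnderROC"})))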
# MAGIC %md
# MAGIC ####Decision Trees
# MAGIC You can read more about Decision Trees from the Programming Guide [here](http://spark.apache.org/docs/latest/mllib-decision-tree.html).
# MAGIC 
# MAGIC Decision Trees is a popular algorithm as it can handle categorical data and work with multiclass data.

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)

# Train model with Training Data
dtModel = dt.fit(trainingData)

# COMMAND ----------

# MAGIC %md We can extract the number of nodes in our decision tree as well as the tree depth of our model.

# COMMAND ----------

print "numNodes = ", dtModel.numNodes
print "depth = ", dtModel.depth

# COMMAND ----------

# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(testData)
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review','label','target_indexed').show()



#**********************************************************************
#-----------Training the model for prediction--------------------------
#**********************************************************************


from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),labelCol=string_indexer.getOutputCol())
dt_model = dt.fit(dfTrainFinal)



# We apply the same steps to our ridiculously small test set.
# In theory a Pipeline would automate all of this, but we probably won't use one.

# EDIT: actually it's quite easy to create transformers from each step, so
# pipelines may well be feasible (see the sketch at the end of this snippet).
df_test_words = tokenizer.transform(dfTest)
df_test_tf = htf.transform(df_test_words)
df_test_tfidf = idfModel.transform(df_test_tf)
df_test_final = string_indexer_model.transform(df_test_tfidf)
# The predictions
df_test_pred = dt_model.transform(df_test_final)
df_test_pred.select('review', 'target_indexed', 'prediction', 'probability').show(5)
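
# As the EDIT above suggests, the steps chain naturally into a Pipeline. A
# minimal sketch, assuming the raw training frame is named dfTrain (not shown):
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[tokenizer, htf, idf, string_indexer, dt])
pipeline_model = pipeline.fit(dfTrain)
pipeline_model.transform(dfTest).select('review', 'prediction', 'probability').show(5)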