"""-------------------------------------------------------------------------- Perform Machine Learning -------------------------------------------------------------------------""" #Split into training and testing data (trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1]) trainingData.count() testData.count() testData.show() from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator #Create the model dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\ featuresCol="features") dtModel = dtClassifer.fit(trainingData) dtModel.numNodes dtModel.depth #Predict on the test data predictions = dtModel.transform(testData) predictions.select("prediction", "species", "label").show() #Evaluate accuracy evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \ labelCol="label",metricName="accuracy") evaluator.evaluate(predictions) #Draw a confusion matrix predictions.groupBy("label", "prediction").count().show()
# Vector assembler df3 = VectorAssembler(inputCols=['Pclass','Gender1','Embarked2','Fare1','Age1'],outputCol='Features').transform(df3) df3.show(truncate=False) # data processing complete--- # 6 .Model building training = df3 training1 = df3 training.show(truncate=False,n=5) from pyspark.ml.classification import DecisionTreeClassifier dt1 = DecisionTreeClassifier(featuresCol='Features',labelCol='Survived') dtmodel1 = dt1.fit(training) predictions = dtmodel1.transform(training) predictions.select('Survived','rawPrediction','probability','prediction').show(n=5,truncate=False) from pyspark.ml.classification import GBTClassifier gbt1 = GBTClassifier(featuresCol='Features',labelCol='Survived',maxDepth=6,maxIter=20) gbtmodel1 = gbt1.fit(training) predictions = gbtmodel1.transform(training) PredictionsandLabels = predictions.select('prediction','Survived').rdd from pyspark.mllib.evaluation import MulticlassMetrics metric1 = MulticlassMetrics(PredictionsandLabels) metric1.accuracy
hotel_clusterIndxr AS label , features FROM dataset ''') dataset = dataset.drop('hotel_clusterIndxr') # Write the cleased dataset to an s3 bucket in parquet format dataset.write.parquet("s3://expedia-hotel-recommendations-workflow/spark_OutputCleasedDataset.parquet") # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = dataset.randomSplit([0.7, 0.3]) # Fit Decision Tree Algorithm dtc = DecisionTreeClassifier(labelCol="label", featuresCol="features") dtcm = dtc.fit(trainingData) # Save trained Logistic Regression Model to s3 Bucket for future use dtcm.save('s3://expedia-hotel-recommendations-workflow/dtcm_model') # Load Pre-Trained Logistic Regression Model to illistrate how model will be imported for future use dtcModel = DecisionTreeClassificationModel.load("s3://expedia-hotel-recommendations-workflow/dtcm_model") # Make predictions with Decision Tree model on the Test Dataset dtcPredictions = dtcModel.transform(testData) # Calculate and print Accuracy score for Decision Tree Algorithm evaluator = MulticlassClassificationEvaluator( labelCol="label", predictionCol="prediction", metricName="accuracy") dtcAccuracy = evaluator.evaluate(dtcPredictions) print("Decision Tree accuracy Error = %g" % (dtcAccuracy))
# Assemble the zeek feature columns into a single vector column.
vector_assembler = VectorAssembler(inputCols= vector_features_zeek, outputCol="features")
dataset = vector_assembler.transform(dataset)
dataset.show(25)
# Split the dataset into train and test (fixed seed 1234 for reproducibility)
splits = dataset.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]
# Build the Decision Tree model, train it and run the prediction
# (timestamps printed before/after training to measure wall-clock time)
now = datetime.datetime.now()
print (now.year, now.month, now.day, now.hour, now.minute, now.second)
dt = DecisionTreeClassifier(labelCol='attack_cat_index', featuresCol='features', impurity='entropy', seed=1234, maxBins=136, maxDepth=25)
dt = dt.fit(train)
now = datetime.datetime.now()
print (now.year, now.month, now.day, now.hour, now.minute, now.second)
result = dt.transform(test)
prediction_df = result.select("attack_cat_index", "prediction").toPandas()
prediction_list = prediction_df[["attack_cat_index","prediction"]].values.tolist()
# Evaluate the prediction: overall accuracy, then weighted precision
evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index", metricName="accuracy")
accuracy = evaluator.evaluate(result)
print("Accuracy = {}".format(accuracy))
evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index", metricName="weightedPrecision")
"label").select("label", col(" enum").alias("enum"), col(" hours").alias("hours")) data = data.select(data.label.cast("double"), "education-num", "hours-per-week") # Create vector assembler for feature columns assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features") data = assembler.transform(data) # Split data into training and test data set training, test = data.select("label", "features").randomSplit([0.85, 0.15]) # Create Decision tree model and fit the model with training dataset dt = DecisionTreeClassifier() model = dt.fit(training) # Generate prediction from test dataset predictions = model.transform(test) # Evuluate the accuracy of the model evaluator = MulticlassClassificationEvaluator() accuracy = evaluator.evaluate(predictions) # Show model accuracy print("Accuracy:", accuracy) # Report predictionAndLabels = predictions.select("label", "prediction").rdd metrics = MulticlassMetrics(predictionAndLabels) print("Confusion Matrix:", metrics.confusionMatrix()) print("Precision:", metrics.precision())
# Train and compare three tree-based classifiers on the sample libsvm dataset.
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('mytree').getOrCreate()
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier,GBTClassifier, DecisionTreeClassifier)
data=spark.read.format('libsvm').load('sample_libsvm_data.txt')
# Show a table with the label column and the features column
data.show()
# Split into training data and test data
train_data,test_data=data.randomSplit([0.7,0.3])
dtc= DecisionTreeClassifier()
rfc= RandomForestClassifier(numTrees=100) # the more trees we add, the longer the computation time
gbt= GBTClassifier()
# Fit the three models
dtc_model=dtc.fit(train_data)
rfc_model=rfc.fit(train_data)
gbt_model=gbt.fit(train_data)
# Transform the test data to obtain predictions
dtc_preds= dtc_model.transform(test_data)
rfc_preds= rfc_model.transform(test_data)
gbt_preds= gbt_model.transform(test_data)
# We get the columns: label|features|rawPrediction|probability|prediction.
# prediction returns the predicted label
# For Decision Tree and Random Forest, rawPredictionCol = 'rawPrediction' by default
dtc_preds.show()
# We still have the columns: label|features|rawPrediction|probability|prediction.
rfc_preds.show()
# Historically GBT produced only label|features|prediction; here the author
# observed the same columns as the other models
gbt_preds.show()
gbt_preds.printSchema()
# Although this is a binary classification dataset, if I only make a binary classification evaluator,
# (comment is cut off at the end of this chunk)
inferSchema=True, header=True) data.printSchema() print(data.head()) data.describe().show() assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol="features") output = assembler.transform(data) from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier rfc = DecisionTreeClassifier(labelCol='Spoiled', featuresCol='features') output.printSchema() final_data = output.select('features', 'Spoiled') final_data.head() rfc_model = rfc.fit(final_data) print( "-----------------feature importance --------------------------------------" ) print(rfc_model.featureImportances) spark.stop()
# COMMAND ---------- from pyspark.ml.feature import VectorAssembler from pyspark.ml.classification import DecisionTreeClassifier # Vectorize the features (all columns excluding the first one, Survived) features = trainDF.columns[1:] assembler = VectorAssembler(inputCols=features, outputCol="features") assembledTrainDF = assembler.transform(trainDF) # Train a decision tree, setting maxDepth parameter to 3 dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Survived", maxDepth=2) dtcModel = dtc.fit(assembledTrainDF) # Print the constructed tree print(dtcModel.toDebugString) # COMMAND ---------- # Visualize the decision tree display(dtcModel) # COMMAND ---------- # MAGIC %md # MAGIC ### Evaluate Feature Importance
#%% not necessary here data_train, data_test = data.randomSplit([.7, .3]) print(f'{data_train.count()}, {data_test.count()}') # 335, 155 #%% from pyspark.ml.classification import DecisionTreeClassifier, \ RandomForestClassifier, \ GBTClassifier est_dt = DecisionTreeClassifier(featuresCol='features', labelCol='SpoiledIdx') est_rf = RandomForestClassifier(featuresCol='features', labelCol='SpoiledIdx') est_gb = GBTClassifier(featuresCol='features', labelCol='SpoiledIdx') #%% model_dt = est_dt.fit(data_train) model_rf = est_rf.fit(data_train) model_gb = est_gb.fit(data_train) #%% dir(model_dt) model_dt.featureImportances model_rf.featureImportances model_gb.featureImportances #%% vv = model_gb.featureImportances dir(vv) type(vv) vv.values
+-----+-----------------------------------------------------------------------------+----------+
|label|features |prediction|
+-----+-----------------------------------------------------------------------------+----------+
|0.0 |[1.0,1.0,1.0,31.0,-1.3333333333333333,3382.0,7293.5] |0.0 |
|0.0 |[1.0,1.0,1.0,31.0,-1.3333333333333333,9791.833333333334,7911.5] |0.0 |
|0.0 |[1.0,1.0,1.0,31.0,1.3333333333333333,54426.333333333336,2635.5] |1.0 |
|1.0 |[1.0,1.0,1.0,34.0,0.0,126839.16666666667,7687.0] |0.0 |
|1.0 |[1.0,1.0,1.0,34.0,1.3333333333333333,103421.66666666667,3367.1666666666665] |1.0 |
|1.0 |[1.0,1.0,1.0,34.0,1.6666666666666667,115795.66666666667,5100.0] |1.0 |
|1.0 |[1.0,1.0,1.0,35.0,-1.6666666666666667,575.6666666666666,0.0] |0.0 |
Random Forest accuracy : 0.807145257028
'''
# (This chunk opens inside a triple-quoted block of pasted Random Forest
#  results, closed just above; no comments can precede it.)
#Decision Trees model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt_model = dt.fit(training_data)
#Predict on the test data
predictions = dt_model.transform(test_data)
result = predictions.select("label", "features", "prediction")
result.where('label = 1.0').show(20, False)
result.where('label = 0.0').show(20, False)
# Python 2 print statement; `evaluator` is defined earlier in the file.
print 'Dicision Tree accuracy : ', evaluator.evaluate(predictions)
'''
Dicision Tree accuracy : 0.805577332288
'''
#Naive Bayes model
nb = NaiveBayes(labelCol="label", featuresCol="features")
nb_model = nb.fit(training_data)
#Predict on the test data
predictions = nb_model.transform(test_data)
# Cast label/feature columns to double, assemble features and fit a decision
# tree on the full training frame; then start preparing the Kaggle test file.
df3 = df3.select(df3.Pclass.cast('double'), df3.Gender1, df3.Embarked2, df3.Survived.cast('double'))
df3.printSchema()
# Vector assembler
df3 = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'], outputCol='Features').transform(df3)
df3.show(truncate=False)
# # 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived')
# 2 learning process - created a model
model2 = dt1.fit(df3)
model2.depth
model2.numFeatures
# 3 get predictions: apply the same Embarked indexing/encoding to the test CSV
df5 = spark.read.csv('E:/kaggle/titanic/test.csv', header=True).select('PassengerId', 'Sex', 'Pclass', 'Embarked')
df5 = StringIndexer(inputCol='Embarked', outputCol='Embarked1').fit(df5).transform(df5)
df5.show()
# NOTE: the statement below is cut off at the end of this chunk.
df5 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2',
# Compare three classifiers with one shared accuracy evaluator;
# `model`, `predictions` and `accuracy` are rebound for each classifier.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
lr = LogisticRegression(maxIter=10, regParam=0.01)
#print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
model = lr.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("LogisticRegression - Test set accuracy = " + str(accuracy))
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
#print("DecisionTreeClassifier parameters:\n" + dt.explainParams() + "\n")
model = dt.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("DecisionTreeClassifier - Test set accuracy = " + str(accuracy))
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
#print("RandomForestClassifier parameters:\n" + rf.explainParams() + "\n")
model = rf.fit(final_train)
predictions = model.transform(final_test)
predictions.show()
accuracy = evaluator.evaluate(predictions)
print("RandomForestClassifier - Test set accuracy = " + str(accuracy))
# MAGIC # MAGIC You can read more about [Decision Trees](http://spark.apache.org/docs/latest/mllib-decision-tree.html) in the Spark MLLib Programming Guide. # MAGIC The Decision Trees algorithm is popular because it handles categorical # MAGIC data and works out of the box with multiclass classification tasks. # COMMAND ---------- from pyspark.ml.classification import DecisionTreeClassifier # Create initial Decision Tree Model dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3) # Train model with Training Data dtModel = dt.fit(trainingData) # COMMAND ---------- # MAGIC %md # MAGIC We can extract the number of nodes in our decision tree as well as the # MAGIC tree depth of our model. # COMMAND ---------- print("numNodes = ", dtModel.numNodes) print("depth = ", dtModel.depth) # COMMAND ---------- display(dtModel)
#String Indexer stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") si_model = stringIndexer.fit(dfUSD) td = si_model.transform(dfUSD) td.collect() td.show() #Splitting data (trainingData, testData) = td.randomSplit([0.6, 0.4]) trainingData.count() testData.count() testData.collect() #Creating decision tree model dtClassifer = DecisionTreeClassifier(labelCol="indexed",minInstancesPerNode=1500) dtModel = dtClassifer.fit(trainingData) dtModel.numNodes dtModel.depth #Predict on the test data predictions = dtModel.transform(trainingData) predictions = dtModel.transform(testData) predictions.select("prediction","indexed","label","features").show(10) #Evaluation evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \ labelCol="indexed",metricName="precision") evaluator.evaluate(predictions) #Draw a confusion matrix labelList=predictions.select("indexed","label").distinct().toPandas()
.format("libsvm")\ .load("data/classification.100000.4.txt") # 70% for training and 30% for testing (training_data, test_data) = data_frame.randomSplit([0.7, 0.3]) # Calculate the time the machine spend before training time_before_training = time.time() print("Time before training: " + str(time_before_training - initial_time)) # Create the decision_tree model decision_tree = DecisionTreeClassifier(labelCol="label", featuresCol="features") # Fit the model model = decision_tree.fit(training_data) # Calculate the time the machine is training training_time = time.time() print("Training time: " + str(training_time - time_before_training)) # Make the prediction prediction = model.transform(test_data) # Calculate the time the machine is testing with the model fitted testing_time = time.time() print("Test time: " + str(testing_time - training_time)) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean, col, split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer
# Index the string cluster label, then drop the original string column
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(dfd)
    for column in ["1st Layer Clusters"]
]
pipeline = Pipeline(stages=indexers)
ddf = pipeline.fit(dfd).transform(dfd)
ddf = ddf.drop("1st Layer Clusters" )
#,"Name","Ticket","Cabin","Embarked","Sex","Initial")
# titanic_df.printSchema()
# from pyspark.sql.types import IntegerType
# Cast every remaining column to int before assembling features
final = ddf.select([col(c).cast('int') for c in ddf.columns])
# final.printSchema()
feature = VectorAssembler(inputCols=final.columns[1:], outputCol="features")
feature_vector = feature.transform(final)
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2], seed=11)
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="1st Layer Clusters_index", featuresCol="features")
dt_model = dt.fit(trainingData)
dt_prediction = dt_model.transform(testData)
dt_prediction.select("prediction", "1st Layer Clusters_index", "features").show()
# trainingData
#section 8.2.6
# OneVsRest is not available in Python.
#section 8.3.1
# Re-index the label column so its values are ML-friendly indices.
from pyspark.ml.feature import StringIndexer
dtsi = StringIndexer(inputCol="label", outputCol="label-ind")
dtsm = dtsi.fit(penlpoints)
pendtlpoints = dtsm.transform(penlpoints).drop("label").withColumnRenamed("label-ind", "label")
pendtsets = pendtlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=20)
dtmodel = dt.fit(pendttrain)
# rootNode is not accessible in Python
dtpredicts = dtmodel.transform(pendtvalid)
# BUG FIX: DataFrame has no .map() in PySpark 2+ — convert to an RDD first
# (this matches the corrected variant of this code elsewhere in the file).
dtresrdd = dtpredicts.select("prediction", "label").rdd.map(lambda row: (row.prediction, row.label))
from pyspark.mllib.evaluation import MulticlassMetrics
dtmm = MulticlassMetrics(dtresrdd)
dtmm.precision()
#0.951442968392121
print(dtmm.confusionMatrix())
#DenseMatrix([[ 205., 0., 3., 0., 0., 3., 1., 0., 0.,
# 0.],
# [ 0., 213., 0., 1., 2., 1., 0., 2., 0.,
# 2.],
# Scale test features, cross-validate a decision tree, write Kaggle-format CSV,
# then fit a second tree on a fresh train/validation split and report accuracy.
df_test =scaler.transform(df_test)
df_test = df_test.drop('features').withColumnRenamed('scaledFeatures','features')
# train data
algo = DecisionTreeClassifier()
grid = ParamGridBuilder().build()  # empty grid: CV used purely for 10-fold validation
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=algo, estimatorParamMaps=grid, numFolds=10, evaluator=evaluator)
cv_model = cv.fit(df_train)
pred = cv_model.transform(df_test)
print("from {}, {} died. {}".format(pred.count(), pred.filter(pred.prediction == 0).count(),pred.filter(pred.prediction == 0).count()/pred.count()))
# Export PassengerId + prediction as the submission file
pred_csv = pred.toPandas()
pred_csv = pred_csv.filter(['PassengerId', 'prediction'])
pred_csv = pred_csv.rename(columns={'prediction':'Survived'})
pred_csv.to_csv (r'lr.csv', index = False, header=True)
data_train, data_test = df_train.randomSplit([0.8,0.2])
# train a decision tree model
# (NOTE: comment previously said "logistic regression" — the estimator is a DecisionTreeClassifier)
algo_t = DecisionTreeClassifier()
model_t = algo_t.fit(data_train)
pat = model_t.transform(data_test)
print("from {}, {} died. {}".format(pat.count(), pat.filter(pat.prediction == 0).count(),pat.filter(pat.prediction == 0).count()/pat.count()))
# NOTE(review): variable name `evaloator` is a typo for `evaluator` (local only)
evaloator = evaluation.MulticlassClassificationEvaluator(metricName='accuracy')
print("Accuracy DecisionTreeClassifier: {}".format(evaloator.evaluate((pat))))
#podział na zbiór treningowy i testowy train, test = model_data.randomSplit([0.7, 0.3], seed=2018) #statystyki danych w zbiorach testowym i treningowym print("Liczba danych w zbiorze treningowym:", train.count()) print("Liczba danych w zbiorze testowym:", test.count()) print("Struktura w zbiorze treningowym:") train.groupBy("label").count().show() print("Struktura w zbiorze testowym:") test.groupBy("label").count().show() #drzewo decyzyjne tstart = datetime.now() dt = DecisionTreeClassifier(featuresCol="features", labelCol="label") dtmodel = dt.fit(train) predictions = dtmodel.transform(test) tend = datetime.now() predictions.select('label', 'prediction', 'probability').show(10) print("dt time", tend - tstart) #obliczenie wartosci tp,tn,fp,fn tp = predictions[(predictions.label == 1) & (predictions.prediction == 1)].count() tn = predictions[(predictions.label == 0) & (predictions.prediction == 0)].count() fp = predictions[(predictions.label == 0) & (predictions.prediction == 1)].count() fn = predictions[(predictions.label == 1) & (predictions.prediction == 0)].count() print("True Positives:", tp)
cars_assembled = assembler.transform(cars) # Check the resulting column kars = cars_assembled.select('features', 'origin_idx') #kars.show(9) # Split data into training and testing sets kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23) print(kars_train.count(), kars_test.count()) # Create a Decision Tree classifier tree = DecisionTreeClassifier(labelCol="origin_idx") # Learn from training data tree = tree.fit(kars_train) # Make predictions on testing data prediction = tree.transform(kars_test) prediction.show(9) # Confusion matrix confusion_matrix = prediction.groupBy("origin_idx", "prediction").count() confusion_matrix.show() # Accuracy evaluator = MulticlassClassificationEvaluator(labelCol="origin_idx", metricName="accuracy") accuracy = evaluator.evaluate(prediction) print("Test set accuracy = " + str(accuracy))
# COMMAND ---------- train_data, test_data = data.randomSplit([0.7, 0.3]) # COMMAND ---------- gbc_class = GBTClassifier() rfc_class = RandomForestClassifier(numTrees=100) dtc_class = DecisionTreeClassifier() # COMMAND ---------- gbc_model = gbc_class.fit(train_data) rfc_model = rfc_class.fit(train_data) dtc_model = dtc_class.fit(train_data) # COMMAND ---------- gbc_pred = gbc_model.transform(test_data) rfc_pred = rfc_model.transform(test_data) dtc_pred = dtc_model.transform(test_data) # COMMAND ---------- #gbc_pred.show() # COMMAND ---------- from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# COMMAND ---------- clean_data = cleaner.transform(data) # COMMAND ---------- clean_data = clean_data.select('label', 'features') # COMMAND ---------- train, test = clean_data.randomSplit([0.7, 0.3]) # COMMAND ---------- spam_detctor = dct.fit(train) # COMMAND ---------- test_result = spam_detctor.transform(test) # COMMAND ---------- test_result.show() # COMMAND ---------- from pyspark.ml.evaluation import MulticlassClassificationEvaluator # COMMAND ----------
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
# In[62]:
# Train DecisionTree model on the scaled connection features.
dt = DecisionTreeClassifier(featuresCol='minmax_scaled_features', labelCol='class_attack')
# In[63]:
dtModel = dt.fit(conn_train)
# ## Make predictions on the test set
# In[64]:
# Run prediction on the test set (conn_test)
conn_test_pred = dtModel.transform(conn_test)
# In[65]:
conn_test_pred.limit(3).toPandas()
# In[66]:
# Select example rows to display.
# Impute missing values, assemble features, train a decision tree and report
# its error. The model is fit and evaluated on the SAME DataFrame (`output`).
passengersWithFilledEmptyValues = imputer.fit(passengers).transform( passengers)
passengersWithFilledEmptyValues.show()  # look at first row
# Step - 4: Transform dataframe to vectorized dataframe
output = assembler.transform(passengersWithFilledEmptyValues).select(
    "features", "survived"
)  # <============== drop row if it has nulls/NaNs in the next list of columns)
output.show()
# Step - 5: Set up the Decision Tree Classifier
trainer = DecisionTreeClassifier(labelCol="survived", featuresCol="features")
# Step - 6: Train the model
model = trainer.fit(output)
# Step - 7: Predict with the model (on the training data itself)
rawPredictions = model.transform(output)
# Step - 8: Evaluate prediction
evaluator = MulticlassClassificationEvaluator(labelCol="survived", predictionCol="prediction", metricName="accuracy")
# Step - 9: Calculate accuracy
accuracy = evaluator.evaluate(rawPredictions)
# BUG FIX: predictions were computed on the same data the model was fit on,
# so this is training error — the previous message mislabelled it "Test Error".
print("Training error = %g " % (1.0 - accuracy))
# Step - 10: Print out the model
print(model.toDebugString)
# Check first five records flights.show(5) # Create an assembler object assembler = VectorAssembler(inputCols=[ "mon", "dom", "dow", "carrier_idx", "org_idx", "km", "depart", "duration" ], outputCol='features') # Consolidate predictor columns flights_assembled = assembler.transform(flights) # Check the resulting column flights = flights_assembled.select('features', 'xdelay') # Split into training and testing sets in a 80:20 ratio flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17) # Create a classifier object and fit to the training data tree = DecisionTreeClassifier(labelCol="xdelay") tree_model = tree.fit(flights_train) # Create predictions for the testing data and take a look at the predictions prediction = tree_model.transform(flights_test) predictions = prediction.select('xdelay', 'prediction', 'probability') print(predictions.toPandas().sample(12)) spark.stop()
# OneVsRest is not available in Python. #section 8.3.1 from pyspark.ml.feature import StringIndexer dtsi = StringIndexer(inputCol="label", outputCol="label-ind") dtsm = dtsi.fit(penlpoints) pendtlpoints = dtsm.transform(penlpoints).drop("label").withColumnRenamed( "label-ind", "label") pendtsets = pendtlpoints.randomSplit([0.8, 0.2]) pendttrain = pendtsets[0].cache() pendtvalid = pendtsets[1].cache() from pyspark.ml.classification import DecisionTreeClassifier dt = DecisionTreeClassifier(maxDepth=20) dtmodel = dt.fit(pendttrain) # rootNode is not accessible in Python dtpredicts = dtmodel.transform(pendtvalid) dtresrdd = dtpredicts.select( "prediction", "label").rdd.map(lambda row: (row.prediction, row.label)) from pyspark.mllib.evaluation import MulticlassMetrics dtmm = MulticlassMetrics(dtresrdd) dtmm.precision() #0.951442968392121 print(dtmm.confusionMatrix()) #DenseMatrix([[ 205., 0., 3., 0., 0., 3., 1., 0., 0., # 0.], # [ 0., 213., 0., 1., 2., 1., 0., 2., 0.,
def prepDataForML(df):
    """Build and apply a Spark ML feature-prep pipeline for the player data.

    Indexes and one-hot-encodes 'School', string-indexes 'Drafted' into
    'label', and assembles the numeric stats plus the encoded categorical
    columns into a single 'features' vector.

    NOTE(review): the function body continues beyond this chunk (the
    train/test split and any return statement are not visible here).
    """
    # https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa
    cols = df.columns
    categoricalColumns = ['School']
    stages = []
    for categoricalCol in categoricalColumns:
        # indexes each categorical column using the StringIndexer
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
        # converts the indexed categories into one-hot encoded variables
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        stages += [stringIndexer, encoder]
    # StringIndexer again to encode our labels to label indices
    label_stringIdx = StringIndexer(inputCol='Drafted', outputCol='label')
    stages += [label_stringIdx]
    # VectorAssembler to combine all the feature columns into a single vector column
    numericCols = [
        'Games Played', 'Att (rushing)', 'Yds (rushing)', 'Avg (rushing)',
        'TD (rushing)', 'Rec (receiving)', 'Yds (receiving)', 'Avg (receiving)',
        'TD (receiving)', 'Plays (scrimmage)', 'Yds (scrimmage)',
        'Avg (scrimmage)', 'TD (scrimmage)', 'Year'
    ]
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages += [assembler]
    # PipeLine for the ML data to follow
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(df)
    df = pipelineModel.transform(df)
    selectedCols = ['label', 'features'] + cols
    df = df.select(selectedCols)
    #df.printSchema()
    #df.show()
    # Randomly split data into train and test sets, and set seed for reproducibility.
# NOTE(review): this chunk continues the previous one and may still be inside
# prepDataForML — indentation was lost in extraction; confirm before reuse.
train, test = df.randomSplit([0.7, 0.3], seed=2018)
train.cache()
test.cache()
# print("Training Dataset Count: " + str(train.count()))
# print("Test Dataset Count: " + str(test.count()))
# Apply machine learning to it
# Logistic Regression --> ROC: 85%, Accuracy: 88.73239436619719%
# lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
# lrModel = lr.fit(train)
# predictions = lrModel.transform(test)
# evaluator = BinaryClassificationEvaluator()
# print('Test Area Under ROC', evaluator.evaluate(predictions) * 100)
# evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# accuracy = evaluator.evaluate(predictions)
# print("Test set accuracy = " + str(accuracy))
# Random Forest --> ROC: 83.90126725368875%, Accuracy: 90.02347417840375%
# rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
# rfModel = rf.fit(train)
# predictions = rfModel.transform(test)
# # predictions.show(1000)
# evaluator = BinaryClassificationEvaluator()
# print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
# evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# accuracy = evaluator.evaluate(predictions)
# print("Test set accuracy = " + str(accuracy))
# Linear SVM --> ROC: 77.35899571632558, Accuracy: 89.67136150234741%
# lsvc = LinearSVC(maxIter=10, regParam=0.1)
# lsvcModel = lsvc.fit(train)
# predictions = lsvcModel.transform(test)
# evaluator = BinaryClassificationEvaluator()
# print('Test Area Under ROC', evaluator.evaluate(predictions) * 100)
# evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
# accuracy = evaluator.evaluate(predictions)
# print("Test set accuracy = " + str(accuracy))
# Decision Tree --> Area Under ROC: 83.31151832460733, Accuracy: 86.97183098591549%
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dtModel = dt.fit(train)
# Cross Validation
predictions = dtModel.transform(test)
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions) * 100)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
# selected features of preprocessed datasets for modeling allFeatures = ['salutation_Vec','newsletter','model_Vec','paymenttype_Vec','voucher','case','numberitems','gift','entry','shippingcosts','weight','remi','cancel','used','w0','w1','w2','w3','w4','w5','w6','w7','w8','w9','w10','books','nobooks','itemseff'] assembler = VectorAssembler(inputCols=allFeatures, outputCol='features') dfTrain = assembler.transform(dfTrain) dfTest = assembler.transform(dfTest) # Training + Prediction ################################### revenues = [] # List of generated results #### Decision Tree Classifier start_time = time() tree = DecisionTreeClassifier() tree_model = tree.fit(dfTrain) predictionTree = tree_model.transform(dfTest) end_time = time() elapsed_time = end_time - start_time print("Time to train Tree on dfTrain and make predictions on dfTest: %.3f seconds" % elapsed_time) # Evaluation predictionTree.groupBy("label", "prediction").count().show() start_time = time() #revenue = costMatrix(predictionTree) tn = predictionTree[(predictionTree.label == 0) & (predictionTree.prediction == 0.0)].count() fn = predictionTree[(predictionTree.label == 1) & (predictionTree.prediction == 0.0)].count() revenue = (tn * 1.5 - fn * 5) #revenue based on costMatrix revenues.append(revenue) end_time = time() elapsed_time = end_time - start_time
df3.show(truncate=False)
training = df3
training1 = df3
training.show(truncate=False, n=5)
# 1 choose approach
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived', seed=5000)
# 2 learning process - created a model
model2 = dt1.fit(training)
model2.depth
model2.numFeatures
# 3 get predictions
# NOTE(review): predictions are computed on the same data the model was fit
# on, so metrics2 below describes TRAINING performance.
training1 = model2.transform(training)
training1.show(5)
PredictionandLabels = training1.select(training1.prediction, training1.Survived).rdd
PredictionandLabels.collect()
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics
# metrics1 = BinaryClassificationMetrics(PredictionandLabels)
# (train score/train accuracy --- )
# (train error = 1-train score ?)
metrics2 = MulticlassMetrics(PredictionandLabels)
# Python 2 syntax (print statements). Inspect the logistic-regression training
# summary, then fit three tree-based classifiers on bInput.
summary = lrModel.summary
print summary.areaUnderROC
summary.roc.show()
summary.pr.show()
# COMMAND ----------
summary.objectiveHistory  # per-iteration objective values of the LR fit
# COMMAND ----------
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print dt.explainParams()
dtModel = dt.fit(bInput)
# COMMAND ----------
from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print rfClassifier.explainParams()
trainedModel = rfClassifier.fit(bInput)
# COMMAND ----------
from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print gbtClassifier.explainParams()
trainedModel = gbtClassifier.fit(bInput)  # rebinds trainedModel (RF model discarded)
################### DD. Decision Tree #################### # 10.0 Call library from pyspark.ml.classification import DecisionTreeClassifier # 10.1 Instantiate modeling object: dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3 ) # 10.2 Fit/train model dtModel = dt.fit(train) # 10.3 MAke predictions and evaluate predictions = dtModel.transform(test) predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction', 'probability').show(10) # 10.4 evaluator = BinaryClassificationEvaluator() evaluator.evaluate(predictions) ################ EE. Random Forest Classification ################## # 11.0 Import library from pyspark.ml.classification import RandomForestClassifier # 11.1 Instantiate object and fit:
# Compare random-forest and decision-tree classifiers on Training_set/Test_set
# (scored by areaUnderROC), then cross-validate the gradient-boosted model.
# NOTE(review): Training_set, Test_set, BinaryClassificationEvaluator and the
# fitted gbt estimator come from earlier in the file.
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(predictionCol="Predicted_median", labelCol="label",
                            featuresCol="features", numTrees=40, maxDepth=30)
rfModel = rf.fit(Training_set)
rf_predictions = rfModel.transform(Test_set)
rf_predictions.filter(rf_predictions['Predicted_median'] == 0) \
    .select("features", "label", "Predicted_median", "probability") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)
evaluator = BinaryClassificationEvaluator()
print("Test_SET (Area Under ROC): " + str(evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})))

## DECISION TREES
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(predictionCol='Predicted_median', featuresCol='features',
                            labelCol='label', maxDepth=30)
dtModel = dt.fit(Training_set)
dt_predictions = dtModel.transform(Test_set)
dt_predictions.select('features', 'label', 'rawPrediction', 'Predicted_median', 'probability').show(10)
evaluator = BinaryClassificationEvaluator()
print("Test_SET (Area Under ROC): " + str(evaluator.evaluate(dt_predictions, {evaluator.metricName: "areaUnderROC"})))

# LETS TRY CROSS VALIDATION ON THE GRADIENT BOOSTING MODEL TO SEE IF THE PERFORMANCE IMPROVES.

# GRADIENT BOOSTING WITH CROSS VALIDATION
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# FIX: the grid originally referenced `gb.*` params while CrossValidator tunes
# `gbt` — the grid entries must belong to the estimator being tuned, otherwise
# the maxDepth/maxBins/maxIter combinations are never applied to gbt.
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 10])
             .addGrid(gbt.maxBins, [10, 20])
             .addGrid(gbt.maxIter, [10, 25])
             .build())
cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=5)

# Run cross validations.
cvModel = cv.fit(Training_set)
gb_cv_predictions = cvModel.transform(Test_set)
gb_cv_predictions.select('features', 'label', 'rawPrediction', 'Predicted_median', 'probability').show(10)
# MAGIC %md
# MAGIC ####Decision Trees
# MAGIC You can read more about Decision Trees from the Programming Guide [here](http://spark.apache.org/docs/latest/mllib-decision-tree.html).
# MAGIC
# MAGIC Decision Trees is a popular algorithm as it can handle categorical data and work with multiclass data.

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)

# Train model with Training Data
dtModel = dt.fit(trainingData)

# COMMAND ----------

# MAGIC %md We can extract the number of nodes in our decision tree as well as the tree depth of our model.

# COMMAND ----------

# FIX: the original used Python-2 `print` statements, which are SyntaxErrors
# under Python 3; converted to print() calls producing the same output.
print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)

# COMMAND ----------

# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(testData)
# Index the string labels of the TF-IDF training set, fit a decision tree on
# it, then replay the same tokenize/TF/IDF/index transformations on the test
# set and predict.
# NOTE(review): dfTrainTFIDF, dfTest, tokenizer, htf, idf and idfModel are
# defined earlier in the file.
from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()

#**********************************************************************
#-----------Training the model for prediction--------------------------
#**********************************************************************

from pyspark.ml.classification import DecisionTreeClassifier
# Wire the tree to the IDF output column and the indexed label column so the
# stages stay consistent if either column name changes upstream.
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol())
dt_model = dt.fit(dfTrainFinal)

# Apply the same transformations to our (tiny) test set.
# In theory a Pipeline would automate all of this, but we will probably not use one.
# EDIT: actually it is fairly easy to create transformers from each stage, so
# pipelines may be feasible after all. To be checked.
df_test_words = tokenizer.transform(dfTest)
df_test_tf = htf.transform(df_test_words)
df_test_tfidf = idfModel.transform(df_test_tf)
df_test_final = string_indexer_model.transform(df_test_tfidf)

# The predictions
df_test_pred = dt_model.transform(df_test_final)
df_test_pred.select('review', 'target_indexed', 'prediction', 'probability').show(5)