# Train three classifiers (logistic regression, decision tree, gradient-boosted
# trees) on train_data and score both train_data and test_data with each.
# `result` and `predictresult` are rebound by every section, so after this
# block they hold the GBT outputs.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary

# --- Logistic regression (default hyper-parameters) ---
blor = LogisticRegression()
blorModel = blor.fit(train_data)
result = blorModel.transform(train_data)
predictresult = blorModel.transform(test_data)
predictresult.show()
print('LogistRegression:')
# NOTE(review): printMetrics receives `result`, i.e. predictions on the
# training split, not on test_data - confirm this is intended.
printMetrics(result)
print('\n')
# result.show()
# Compute the accuracy
# print(result.filter(result.label == result.prediction).count()/result.count())

# --- Decision tree ---
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=5, maxBins=1600)
dtModel = dt.fit(train_data)
result = dtModel.transform(train_data)
predictresult = dtModel.transform(test_data)
predictresult.show()
print('DecisionTree:')
printMetrics(result)
print('\n')
# accuracy
# print(result.filter(result.label == result.prediction).count()/result.count())

# --- Gradient-boosted trees (same depth/bins as the decision tree above) ---
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxDepth=5, maxBins=1600)
gbtModel = gbt.fit(train_data)
result = gbtModel.transform(train_data)
predictresult = gbtModel.transform(test_data)
# Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Test Error = %g" % (1.0 - accuracy)) rfModel = model.stages[2] print(rfModel) # summary only print(accuracy) # # DecisionTreeClassifier from pyspark.ml.classification import DecisionTreeClassifier bt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features") pipeline = Pipeline(stages=[labelIndexer, vectorAssembler, bt, labelConverter]) model = pipeline.fit(train_dataset) model.write().overwrite().save( "s3a://wineappcloud/DecisionTreeClassifier.model") # Make predictions. predictions = model.transform(validation_dataset) # Select example rows to display. predictions.select("predictedLabel", total_columns[-1], "features").show(5) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predictions)
# Fit on whole dataset to include all labels in index. #labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(vd) # Automatically identify categorical features, and index them. # We specify maxCategories so features with > 4 distinct values are treated as continuous. #featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",maxCategories=4).fit(vd) # Train a DecisionTree model. #dtr = DecisionTreeRegressor(featuresCol="indexedFeatures") # Split the data into training and test sets (30% held out for testing) (training, test) = vd.randomSplit([0.7, 0.3]) stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") si_model = stringIndexer.fit(training) td = si_model.transform(training) dt = DecisionTreeClassifier(maxDepth=7, maxBins=32, labelCol="indexed") model = dt.fit(td) print(model.numNodes) print(model.depth) print(model.featureImportances) print(model.numFeatures) print(model.numClasses) print(model.toDebugString) result = model.transform(test).head() # Train a DecisionTree model. #dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", maxBins = 4, maxDepth = 7, impurity = "gini") # Chain indexers and tree in a Pipeline #pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
# Build a (species, label, features) DataFrame from the normalised iris data,
# split it 90/10, fit a depth-2 decision tree and measure accuracy on both
# splits. Bare expressions (count(), collect(), evaluate(...)) only display a
# value when run interactively.
irislp = irisNormDf.rdd.map(transformToLabeledPoint)
irisLpDf = SpSession.createDataFrame(irislp, ["species", "label", "features"])
irisLpDf.select('label','features', 'species').show(10)
irisLpDf.cache()

#Split the training data:
(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

dt_classifier = DecisionTreeClassifier(maxDepth = 2, labelCol = 'label', featuresCol = 'features')
dt_model = dt_classifier.fit(trainingData)
dt_model.numNodes
dt_model.depth
######
# Predictions on the training split (resubstitution).
predictions = dt_model.transform(trainingData)
predictions.select('prediction', 'species', 'label').collect()
####
evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction', labelCol = 'label', metricName = 'accuracy')
# Accuracy on the training split.
evaluator.evaluate(predictions)
# Accuracy on the held-out 10%.
cv_predict = dt_model.transform(testData)
evaluator.evaluate(cv_predict)
# MAGIC # MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> See <a href="https://en.wikipedia.org/wiki/Hyperparameter_optimization" target="_blank">the Wikipedia article on hyperparameter optimization</a> for more information. # COMMAND ---------- from pyspark.ml import Pipeline from pyspark.ml.feature import VectorAssembler from pyspark.ml.classification import DecisionTreeClassifier titanicDF = spark.read.table("titanic_clean").cache() trainDF, testDF = titanicDF.randomSplit([0.8, 0.2], seed=10) assembler = VectorAssembler(inputCols=titanicDF.columns[1:], outputCol="features") dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Survived") pipeline = Pipeline(stages=[assembler, dtc]) # COMMAND ---------- # MAGIC %md-sandbox # MAGIC `ParamGridBuilder()` allows us to string together all of the different possible hyperparameters we would like to test. In this case, we can test the maximum number of iterations, whether we want to use an intercept with the y axis, and whether we want to standardize our features. # MAGIC # MAGIC <img alt="Caution" title="Caution" style="vertical-align: text-bottom; position: relative; height:1.3em; top:0.0em" src="https://files.training.databricks.com/static/images/icon-warning.svg"/> Since grid search works through exhaustively building a model for each combination of parameters, it quickly becomes a lot of different unique combinations of parameters. # COMMAND ---------- from pyspark.ml.tuning import ParamGridBuilder paramGrid = (ParamGridBuilder().addGrid(dtc.maxDepth, [2, 3, 4, 5, 6]).addGrid(
.map(lambda row: LabeledPoint(row.label,Vectors.fromML(row.features))) trainRdd.take(1) from pyspark.ml.classification import * lr = LogisticRegression(maxIter=10, regParam=0.01) lrModel = lr.fit(trainDf) # print "* summary: ", lrModel.summary # print "* coefficients: ", lrModel.coefficients # print "* intercept: ", lrModel.intercept from pyspark.mllib.classification import LogisticRegressionWithSGD lrm = LogisticRegressionWithSGD.train(trainRdd, iterations=10) from pyspark.ml.classification import DecisionTreeClassifier dt = DecisionTreeClassifier(labelCol="label", featuresCol="features") model=dt.fit(trainDf) from pyspark.mllib.regression import LabeledPoint #_rdd=df.rdd.map(lambda x:LabeledPoint(x.class,[1,1])) _rdd=df.rdd.map(lambda x:LabeledPoint(x.cls,[1,1])) _rdd.first() from pyspark.mllib.clustering import LDA #from pyspark.mllib.linalg import SparseVector, Vector, Vectors from pyspark.ml.feature import CountVectorizer, RegexTokenizer regexTok = RegexTokenizer(inputCol="sent", outputCol="wordsRegex", pattern="\\s+") reDf=regexTok.transform(df) cv = CountVectorizer(inputCol="wordsRegex", outputCol="cv")
----------------------------------------- df3 = df3.select(df3.Pclass.cast('double'),df3.SibSp.cast('double'),df3.Survived.cast('double'),df3.Fare.cast('double')) df3.show() df3.printSchema() # Vector assembler from pyspark.ml.feature import VectorAssembler df3 = VectorAssembler(inputCols=['Pclass','SibSp','Fare'],outputCol='Features').transform(df3) df3.show() # # 1 choose approach from pyspark.ml.classification import DecisionTreeClassifier dt1 = DecisionTreeClassifier(featuresCol='Features',labelCol='Survived',maxDepth=10,impurity='entropy') # 2 learning process - created a model model1 = dt1.fit(df3) model1.depth print(model1.toDebugString) # 3 get predictions df5 = spark.read.csv('E:/kaggle/titanic/test.csv',header=True).select('PassengerId','Pclass','SibSp') df5 df5 = df5.select(df5.Pclass.cast('double'),df5.SibSp.cast('double'),df5.PassengerId)
(trainingData, testData) = indexedData.randomSplit([0.8, 0.2]) # ### DecisionTree Classifier # * Specify the features and label columns # * <b>maxDepth: </b>The maximum depth of the decision tree # * <b>impurity: </b>We use gini instead of entropy. Gini measurement is the probability of a random sample being classified correctly. Entropy is a measure of information (seek to maximize information gain when making a split). Outputs generally don't vary much when either option is chosen, but entropy may take longer to compute as it calculates a logarithm # In[17]: from pyspark.ml.classification import DecisionTreeClassifier dtree = DecisionTreeClassifier( labelCol='indexedLabel', featuresCol='features', maxDepth=3, impurity='gini' ) # #### Traing the model using the training data # In[18]: model = dtree.fit(trainingData) # #### Use Spark ML's MulticlassClassificationEvaluator to evaluate the model # * Used to evaluate classification models # * It takes a set of labels and predictions as input
# Resubstitution accuracy of the previous model, followed by two random-forest
# fits (100 and 1000 trees) on `training`. Bare attribute expressions only
# display a value when run interactively.
metrics1 = MulticlassMetrics(PredictionsandLabels)
metrics1.accuracy

# Random forest with 100 trees.
rf1 = RandomForestClassifier(featuresCol='Features',labelCol='Survived',numTrees=100)
rf_model1 = rf1.fit(training)
rf_model1.getNumTrees
rf_model1.numClasses
print(rf_model1.featureImportances)
training20 = rf_model1.transform(training)

from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
# maxDepth=30 is the deepest tree Spark ML accepts.
dt1 = DecisionTreeClassifier(featuresCol='Features',labelCol='Survived', maxDepth=30)

# Removed `from pyspark.mllib.classification import Random`: that module
# exports no such name, so the statement raised ImportError and stopped the
# script before the forest below was trained.
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier

# Random forest with 1000 trees; rebinds rf1 / rf_model1 from above.
rf1 = RandomForestClassifier(featuresCol='Features',labelCol='Survived',numTrees=1000)
rf_model1 = rf1.fit(training)
rf_model1.getNumTrees
print(rf_model1.toDebugString)
print(rf_model1.featureImportances)
# Predictions on the training data itself (resubstitution).
training2_1 = rf_model1.transform(training)
training2_1.select('prediction','Survived').show()
# Apply Logistic Regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter=100, regParam=0.05, labelCol='index').fit(train)

# Evaluate model based on AUC ROC (default for binary classification)
from pyspark.ml.evaluation import BinaryClassificationEvaluator


def testModel(model, validate=validate):
    """Score a fitted model on a validation DataFrame.

    Args:
        model: fitted transformer whose ``transform`` adds the prediction
            columns the evaluator reads.
        validate: DataFrame to score. The default is the module-level
            ``validate`` as it was bound when this function was defined.

    Returns:
        The value returned by ``BinaryClassificationEvaluator.evaluate`` with
        ``labelCol='index'`` and the evaluator's default metric.
    """
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)


print('****************************************************AUC ROC is' + str(testModel(lr)))

from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)

models = {'LogisticRegression': lr, 'DecistionTree': dt, 'RandomForest': rf}
# dict.items() exists on both Python 2 and 3; the original iteritems() is
# Python-2-only and raises AttributeError on Python 3.
modelPerf = {k: testModel(v) for k, v in models.items()}
print(modelPerf)
# In[16]: #trainingData.count(),testData.count() # # Create and train decision tree : # The labelCol argument is the column we are trying to predict, featuresCol specifies the aggregated features column, maxDepth is stopping criterion for tree induction based on maximum depth of tree, minInstancesPerNode is stopping criterion for tree induction based on minimum number of samples in a node, and impurity is the impurity measure used to split nodes. # # We can create a model by training the decision tree. This is done by executing it in a Pipeline: # In[17]: dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=5, minInstancesPerNode=20, impurity="gini") # In[18]: pipeline = Pipeline(stages=[dt]) model = pipeline.fit(trainingData) # Let's make predictions using our test data set: # In[19]: predictions = model.transform(testData) # Looking at the first ten rows in the prediction, we can see the prediction matches the input:
# Fit a decision tree and a random forest on the dog-food data and print each
# model's feature importances to see which of the columns A-D relates most to
# the 'Spoiled' label. Both models are fitted on the full dataset; there is
# no train/test split in this block.
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

spark = SparkSession.builder.appName('Tree').getOrCreate()

# NOTE(review): absolute, machine-specific path.
base_path = '/home/edoardo/Udemy/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/'
file_name = 'dog_food.csv'

data = spark.read.csv(base_path + file_name, header=True, inferSchema=True)
data.printSchema()
data.show()

cols = ['A', 'B', 'C', 'D']
data.show()

# Pack the four input columns into the single vector column the models read.
assembler = VectorAssembler(inputCols=cols, outputCol='features')
output = assembler.transform(data)

dtc = DecisionTreeClassifier(labelCol='Spoiled', featuresCol='features')
dtc_model = dtc.fit(output)

rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')
rfc_model = rfc.fit(output)

print(dtc_model.featureImportances)
print(rfc_model.featureImportances)
pipeline = Pipeline( stages=[cbwd_Indexer, Harm_Indexer, cbwd_encoder, assembler]) pipeline_model = pipeline.fit(df) pipe_df = pipeline_model.transform(df) pipe_df = pipe_df.select('label', 'features') pipe_df.printSchema() # In[ ]: # Decision Tree Classifier from pyspark.ml.classification import DecisionTreeClassifier train_data, test_data = pipe_df.randomSplit([0.7, 0.3]) print("Training Dataset Count: " + str(train_data.count())) print("Test Dataset Count: " + str(test_data.count())) dt = DecisionTreeClassifier(featuresCol='features', labelCol='label') dt_Model = dt.fit(train_data) predictions = dt_Model.transform(test_data) predictions.select("prediction", "label", "features").toPandas() # In[10]: # Randomforest Classifier from pyspark.ml.classification import RandomForestClassifier train_data, test_data = pipe_df.randomSplit([0.8, 0.2]) print("Training Dataset Count: " + str(train_data.count())) print("Test Dataset Count: " + str(test_data.count())) rf = RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=30,
# 对训练集的数据进行词转向量的转化 manbing = model.transform(train_set) end = time.time() print("词转向量用时:{}".format(end - start)) """ 训练数据构建模型 """ start = time.time() # 从源数据中指定特征列 assembler = VectorAssembler(inputCols=["d_func_result"], outputCol="features") # assembler对象是一个transformer,将多列数据转化为单列的向量列(决策树可以识别的类型) # train_set2 = assembler.transform(manbing) # 构建决策树,配置标签列和特征列 dt = DecisionTreeClassifier(labelCol="label", featuresCol="features") # 构建一个评估器(用于二分类问题的结果评估) # 配置参数包括(评估的标签)、评估的单位AUC和ACC(默认AUC)、(???) evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label', metricName='areaUnderROC') # 构建参数网格旨在减少工作量及需要设置超参数(???) # 参数包括决策树纯度参数,并设置基尼指数和熵、决策树的最大深度、决策树的最大划分数 paramGrid = ParamGridBuilder().addGrid(dt.impurity, ['gini', 'entropy'])\ .addGrid(dt.maxDepth, [5, 10, 15]).addGrid(dt.maxBins, [10,])\ .build() # 构建交叉验证器 # 参数包括:决策树,评估器,评估器参数.
res_lr = lr.transform(test_lr)

#----------------- Decision and Random Forest -----------------
# Final assembly: combine the 'norm_cols' column with the '<name>classVec'
# column of every categorical feature into one 'features' vector.
inputCols = ['norm_cols' ] + [cname + "classVec" for cname in categorical_cols]
final_assembler = VectorAssembler(inputCols=inputCols, outputCol='features')
stages += [final_assembler]

pipeline = Pipeline(stages=stages)
# Fit the preprocessing pipeline ONCE, on the training split, and apply that
# same fitted model to both splits. The original re-fitted the pipeline on
# `test`, so any stage that learns from data was learned separately for the
# test split and the models below were scored on features encoded differently
# from the ones they were trained on.
# NOTE(review): if `stages` contains indexers, categories that occur only in
# `test` are now handled by the indexer's handleInvalid setting - confirm it
# is configured as desired.
pipeline_model = pipeline.fit(train)
train_final = pipeline_model.transform(train)
test_final = pipeline_model.transform(test)

# Train on the transformed training split, predict on the transformed test split.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label').fit(train_final)
res_dt = dt.transform(test_final)

rf = RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=20).fit(train_final)
res_rf = rf.transform(test_final)

# Write (prediction, label) pairs for each model under the output prefix
# given as the second command-line argument.
res_lr.select('prediction', 'label').write.csv(sys.argv[2] + "lr", header=True)
res_dt.select('prediction', 'label').write.csv(sys.argv[2] + "dt", header=True)
res_rf.select('prediction', 'label').write.csv(sys.argv[2] + "rf", header=True)

spark.stop()
evaluator.evaluate(predictionDf_gbt, {evaluator.metricName: "areaUnderROC"}))) # In[279]: #original gave Test Area Under ROC: 0.8277322240593465 #wow... it got reduced... just marginally but... great: Test Area Under ROC: 0.8217262618899654 # ## Decision tree # In[1]: from pyspark.ml.classification import DecisionTreeClassifier dt = DecisionTreeClassifier(featuresCol='features', labelCol='label', maxDepth=3) dtModel = dt.fit(train) predictions = dtModel.transform(test) # In[281]: crossval_d = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), numFolds=2) # In[282]: cvModel_d = crossval_d.fit(train)
# Index the iris string label, split 90/10, fit a depth-2 decision tree and
# inspect its predictions. Bare expressions (collect(), count(),
# evaluate(...)) only display a value when run interactively.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
# The indexer is fitted on the full DataFrame, before the split.
si_model = stringIndexer.fit(irisDF)
td = si_model.transform(irisDF)
td.collect()

#Split into training and testing data
(trainingData, testData) = td.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="indexed")
dtModel = dtClassifer.fit(trainingData)
dtModel.numNodes
dtModel.depth

#Predict
# NOTE(review): the original comment said "on the test data", but the code
# scores trainingData, so the metric below is a training-set metric.
predictions = dtModel.transform(trainingData)
predictions.select("prediction", "indexed", "label", "features").collect()
# NOTE(review): metricName "precision" is not accepted by every Spark release
# (later versions list "weightedPrecision", "accuracy", ...) - confirm
# against the Spark version in use.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="precision")
evaluator.evaluate(predictions)

#Draw a confusion matrix
# Distinct (index, label) pairs, collected to pandas.
labelList = predictions.select("indexed", "label").distinct().toPandas()
predictions.groupBy("indexed", "prediction").count().show()
# Python 2 script section: build a review/label DataFrame, split it 80/20 and
# assemble a tokenizer -> TF -> IDF -> label indexer -> decision tree pipeline.
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
# No seed is passed, so the split differs between runs.
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"

# Tokenizer configured with NLTK's English stop-word list.
tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
# Each stage reads the previous stage's output column.
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


#****************************************************************
#*********************CROSS VALIDATION: 80%/20%******************
#*******************Model: DecisionTreeClassifier*****************
#*****************************************************************

# NOTE(review): metricName 'precision' is not accepted by every Spark release
# - confirm against the Spark version in use.
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
# Load the libsvm sample data, split it 70/30 and fit a decision tree, a
# gradient-boosted-tree model and a random forest on the training split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName('Tree').getOrCreate()

# NOTE(review): absolute, machine-specific path.
base_path = '/home/edoardo/Udemy/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Tree_Methods/'
file_name = 'sample_libsvm_data.txt'

data = spark.read.format('libsvm').load(base_path + file_name)
data.printSchema()
data.show()

# No seed is passed, so the split differs between runs.
train_data, test_data = data.randomSplit([0.7, 0.3])

# The decision tree and GBT use their default values; the random forest
# overrides numTrees (100).
dtc = DecisionTreeClassifier()
gbt = GBTClassifier()
rfc = RandomForestClassifier(numTrees=100)

# Fit the models
dtc_model = dtc.fit(train_data)
gbt_model = gbt.fit(train_data)
rfc_model = rfc.fit(train_data)

# Get the predictions
dtc_preds = dtc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)

# Show the predictions
dtc_preds.show()
# Vector assembler df5 = VectorAssembler(inputCols=[ 'id', 'bone_length', 'rotting_flesh', 'hair_length', 'has_soul', 'color2' ], outputCol='Features').transform(df4) df5.show(truncate=False) df5.printSchema() # -------------------------------------------------------------------------- # data processing complete--- # 6 .Model building training = df5 training.show(truncate=False, n=5) from pyspark.ml.classification import DecisionTreeClassifier dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='type1', maxDepth=10) model22 = dt1.fit(training) model22.depth #model22.numFeatures training2 = model22.transform(training) PredictionsandLabels = training2.select('prediction', 'type1').rdd PredictionsandLabels.collect() # -------------------------------------------------------------- #Resubstitution approach from pyspark.mllib.evaluation import MulticlassMetrics metrics1 = MulticlassMetrics(PredictionsandLabels) metrics1.accuracy # -------------------------------------------------------------------------- # data processing complete---
training_data, test_data = SVM_df.randomSplit([0.6, 0.4], seed=123) SVM_df.count() #Modelovanie print "---------------------------------------------------------------------" print "-----------------------------Modelovanie-----------------------------" print "---------------------------------------------------------------------" #Decision tree classifier print "-------------------------------------------------" print "---------------DESICION TREE---------------" print "-------------------------------------------------" tree_classifier = DecisionTreeClassifier(featuresCol="features", labelCol="Accident_Severity", impurity="entropy", maxDepth=10, maxBins=100) tree_model = tree_classifier.fit(training_data) predictions = tree_model.transform(test_data) #print(tree_model.toDebugString) test_error = predictions.filter( predictions["prediction"] != predictions["Accident_Severity"]).count( ) / float(test_data.count()) print "Testing error: {0:.4f}".format(test_error) # Select example rows to display. predictions.select("prediction", "Accident_Severity", "features").show(5) #Model rozhodovacie stromu print(tree_model.toDebugString) #vyhodnotenie decision tree evaluatorMulti = MulticlassClassificationEvaluator(
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(testing_data.count()))
# testing_data.show(3)

# model built and evaluation
# ---------------- Description about the evaluation elements ---------------------------------
# input of Decision Model -->label (label to predict labelcol) { to access use labelCol }
#                         -->features_all (feature use for prediction in vector form) { to access use features_allCol }
#output of Decision --> prediction(actual predicton of model) {to access use predictionCol}
#                   --> rawPrediction() { to access use rawPredictionCol }
#                   --> probability() { to access ues probabilityCol }

# Train the model
DecisonM = DecisionTreeClassifier(featuresCol='features_all', labelCol='Pred_Label', maxDepth=20)
# start/end bracket the fit call.
start=time.time()
DecisionModel = DecisonM.fit(training_data)
end=time.time()

# start1/end1 bracket the transform call.
# NOTE(review): transform() on a DataFrame is lazy in Spark, so this interval
# presumably excludes the actual scoring work - confirm before reporting it.
start1=time.time()
f_predictions = DecisionModel.transform(testing_data)
end1=time.time()

# #PRINT CONFUSION MATRIX
# Distinct (Pred_Label, label) pairs, collected to pandas.
Cm=f_predictions.select("Pred_Label","label").distinct().toPandas()
# Count of rows per (true label, predicted value) pair.
f_predictions.groupBy("Pred_Label","prediction").count().show()
# Fit a default decision tree on three categorical mushroom features and
# print the tree plus the evaluator's score. The model is evaluated on the
# same DataFrame it was fitted on; there is no held-out split in this block.
spark = sql.SparkSession.builder \
    .master('local') \
    .appName('Decision Trees') \
    .getOrCreate()

df = spark.read \
    .format('com.databricks.spark.csv') \
    .option('inferSchema', value=True) \
    .option('header', 'true') \
    .load('data/mushrooms/mushrooms.data')

categories = ['cap-shape', 'cap-surface', 'cap-color']

# create categorical columns for above 3 features
# Folds string_to_index over the category names, threading df through each call.
# NOTE(review): on Python 3 `reduce` must be imported from functools - confirm
# the file does so.
df = reduce(string_to_index, categories, df)

# gather features columns into a single features column
# presumably string_to_index writes each indexed column as 'i-<name>'; verify
# against its definition.
df = VectorAssembler(inputCols=['i-' + x for x in categories], outputCol='features').transform(df)

# create label column
df = StringIndexer(inputCol='edible?', outputCol='label').fit(df) \
    .transform(df)

# fit Decision Trees model
tree = DecisionTreeClassifier()
model = tree.fit(df)

print(model.toDebugString)

bce = BinaryClassificationEvaluator()
print(bce.evaluate(model.transform(df)))
train_set, test_set = dataset.randomSplit([0.75, 0.25], seed=2019) print("Training set Count: " + str(train_set.count())) print("Test set Count: " + str(test_set.count())) # Logistic Regression model lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.8, featuresCol="features", labelCol="label_num", family="multinomial") # Decision Tree model dt = DecisionTreeClassifier(labelCol="label_num", featuresCol="features", maxBins=70) # Random Forest model rf = RandomForestClassifier(labelCol="label_num", featuresCol="features", numTrees=20, maxBins=70) # Naive Bayes Multinomial nb = NaiveBayes(labelCol="label_num", featuresCol="features", smoothing=1.0, modelType="multinomial") classifiers = {
labelCol=row.schema.names[-1]) lrModel = lr.fit(train) # testing predictions = lrModel.transform(test) # predictions.select(row.schema.names[-1], 'rawPrediction', 'prediction', 'probability').show(1000) accuracy = evaluator.evaluate(predictions) print("(Logistic Regression) Testing Accuracy = %g " % accuracy) # ### Decision Tree from pyspark.ml.classification import DecisionTreeClassifier # training dt = DecisionTreeClassifier(featuresCol="features", labelCol=row.schema.names[-1], maxDepth=10) dtModel = dt.fit(train) # testing predictions = dtModel.transform(test) # predictions.select(row.schema.names[-1], 'rawPrediction', 'prediction', 'probability').show(1000) accuracy = evaluator.evaluate(predictions) print("(Decision Tree) Testing Accuracy = %g " % accuracy) # ### Random Forest from pyspark.ml.classification import RandomForestClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator # training
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Encodes a string column of labels to a column of label indices
indexer = StringIndexer(inputCol="type", outputCol="typeIndexed")

# VectorAssembler is a transformer that combines a given list of columns into a single vector column
va = VectorAssembler(inputCols=[
    "typeIndexed", "amount", "oldbalanceOrg", "newbalanceOrig",
    "oldbalanceDest", "newbalanceDest", "orgDiff", "destDiff"
], outputCol="features")

# Using the DecisionTree classifier model
# A fixed seed is passed to the classifier.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", seed=54321, maxDepth=5)

# Create our pipeline stages
pipeline = Pipeline(stages=[indexer, va, dt])

# COMMAND ----------

# View the Decision Tree model (prior to CrossValidator)
dt_model = pipeline.fit(train)
# stages[-1] is the last stage of the fitted pipeline, i.e. the fitted tree.
display(dt_model.stages[-1])

# COMMAND ----------

# MAGIC %md ### Use BinaryClassificationEvaluator
# MAGIC Determine the accuracy of the model by reviewing the `areaUnderPR` and `areaUnderROC`
"label = 1 and prediction = 0").count() print("truePositive: " + str(truePositive)) print("falsePositive: " + str(falsePositive)) print("trueNegative: " + str(trueNegative)) print("falseNegative: " + str(falseNegative)) print("-----") # COMMAND ---------- # MAGIC %md #6. Decision tree - different algorithm # COMMAND ---------- dtModel = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3).fit(trainingData) predictions = dtModel.transform(testData) truePositive = predictions.select("label").filter( "label = 1 and prediction = 1").count() falsePositive = predictions.select("label").filter( "label = 0 and prediction = 1").count() trueNegative = predictions.select("label").filter( "label = 0 and prediction = 0").count() falseNegative = predictions.select("label").filter( "label = 1 and prediction = 0").count() print("truePositive: " + str(truePositive)) print("falsePositive: " + str(falsePositive))
label_list = dataset.select(["Label", "Label_Idx"]).distinct().orderBy("Label_Idx").select("Label").rdd.flatMap(lambda x: x).collect() # print(label_list) dataset = dataset.select(["features","Label_Idx"]) # dataset.printSchema() train_set, test_set = dataset.randomSplit([0.75, 0.25], seed=2019) print("Training set Count: " + str(train_set.count())) print("Test set Count: " + str(test_set.count())) # Logistic Regression model lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.8, featuresCol="features", labelCol="Label_Idx", family="multinomial") # Decision Tree model dt = DecisionTreeClassifier(labelCol="Label_Idx", featuresCol="features", maxBins=len(features)) # Random Forest model rf = RandomForestClassifier(labelCol="Label_Idx", featuresCol="features", numTrees=20, maxBins=len(features)) # Naive Bayes Multinomial nb = NaiveBayes(labelCol="Label_Idx", featuresCol="features", smoothing=1.0, modelType="multinomial") classifiers = {"Logistic Regression": lr, "Decision Tree": dt, "Random Forest": rf, "Naive Bayes Multinomial": nb} metrics = ["accuracy", "weightedPrecision", "weightedRecall", "f1"] print("\nModels Evaluation:") print("{:-<24}".format("")) for idx, c in enumerate(classifiers):
# Delayed flights with Gradient-Boosted Trees # You've previously built a classifier for flights likely to be delayed using a Decision Tree. In this exercise you'll compare a Decision Tree model to a Gradient-Boosted Trees model. # The flights data have been randomly split into flights_train and flights_test. # Instructions # 100 XP # Import the classes required to create Decision Tree and Gradient-Boosted Tree classifiers. # Create Decision Tree and Gradient-Boosted Tree classifiers. Train on the training data. # Create an evaluator and calculate AUC on testing data for both classifiers. Which model performs better? # Find the number of trees and the relative importance of features in the Gradient-Boosted Tree classifier. # Import the classes required from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier from pyspark.ml.evaluation import BinaryClassificationEvaluator # Create model objects and train on training data tree = DecisionTreeClassifier().fit(flights_train) gbt = GBTClassifier().fit(flights_train) # Compare AUC on testing data evaluator = BinaryClassificationEvaluator() evaluator.evaluate(tree.transform(flights_test)) evaluator.evaluate(gbt.transform(flights_test)) # Find the number of trees and the relative importance of features print(gbt.trees) print(gbt.featureImportances)
labelIndexer = StringIndexer( inputCol="Position", outputCol="indexedTarget") # Vectorindexer featureIndexer = VectorIndexer( inputCol="features", outputCol="indexedFeatures" , maxCategories=4) # Dividimos el dataset (training_df, test_df) = df.randomSplit([0.7, 0.3]) # Entrenamiento entrenador = DecisionTreeClassifier( labelCol="indexedTarget", featuresCol="indexedFeatures") # Creacion de pipeline pipeline = Pipeline(stages=[labelIndexer, featureIndexer, entrenador]) # Se entrena el modelo model = pipeline.fit(training_df) # Validacion predictions_df = model.transform(test_df) predictions_df.select( "indexedFeatures", "indexedTarget", "prediction", "rawPrediction").show() # Evaluador --> Accuracy evaluator = MulticlassClassificationEvaluator(