"""Find out the max depth of the trained Decision Tree, and its total number of nodes."""

# YOUR CODE HERE
# NOTE(review): this relies on the model's printed form to expose its depth
# and node count -- confirm that holds for the Spark version in use.
print(model)

"""It appears that the default settings of the Decision Tree implemented in MLlib did not allow us to train a very powerful model!

Before starting to train a Decision Tree, you can tune the max depth it can reach using the [setMaxDepth()](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier.setMaxDepth) method.

Train 21 different DTs, varying the max depth from 0 to 20, endpoints included (i.e., [0, 20]). For each value of the parameter, print the accuracy achieved on the test set, and the number of nodes contained in the given DT.

**IMPORTANT:** this parameter sweep can take 30 minutes or more, depending on how busy is your Colab instance. Notice how the induction time grows super-linearly!
"""

# YOUR CODE HERE
# The evaluator does not depend on the depth being tried, so build it once
# and reuse it for both the training and the test predictions.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Depths 0..20 inclusive (21 trees), exactly as the exercise text above asks.
for depth in range(21):
    dt = DecisionTreeClassifier()
    dt.setMaxDepth(depth)
    model = dt.fit(training)

    # Accuracy on the data the tree was fitted on.
    predictions = model.transform(training)
    accuracy = evaluator.evaluate(predictions)
    print("The training accuracy of depth {} is {}".format(depth, accuracy))

    # Accuracy on the held-out test set.
    predictions = model.transform(test)
    accuracy = evaluator.evaluate(predictions)
    print("The test accuracy of depth {} is {}".format(depth, accuracy))

    # Same summary print as in the first cell, once per fitted tree.
    print(model)

"""Once you have working code for each cell above, **head over to Gradescope, read carefully the questions, and submit your solution for this Colab**!"""
"pcg42", "pcg43", "pcg44", "pcg45", "pcg46", "sp1", "sp2", "sp3", "sp4", "sp5", "sp6", "sp7", "sp8", "sp9", "sp10", "sp11", "sp12", "sp13", "pg1", "pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11", "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2", "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max", "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max", "labCount_min", "labCount_ave", "labcount_months" ]) vecAssembler.setOutputCol("features") print vecAssembler.explainParams() from pyspark.ml.classification import DecisionTreeClassifier aft = DecisionTreeClassifier() aft.setLabelCol("Readmitlabel") aft.setMaxDepth(30) print aft.explainParams() # COMMAND ---------- from pyspark.ml import Pipeline # We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar. lrPipeline = Pipeline() # Now we'll tell the pipeline to first create the feature vector, and then do the linear regression lrPipeline.setStages([vecAssembler, aft]) # Pipelines are themselves Estimators -- so to use them we call fit: lrPipelineModel = lrPipeline.fit(finaldf)
def main():
    """Train decision trees of max depth 1..8 on MNIST and report test precision.

    Loads the LibSVM-format train/test files found under ``dataset/svm``
    next to this script, converts them to DataFrames, fits one
    ``StringIndexer`` + ``DecisionTreeClassifier`` pipeline per max depth,
    and prints the "precision" metric of each fitted pipeline on the test
    DataFrame.

    Exits the process with status 1 if the loaded RDD has no ``toDF``
    attribute.
    """
    # Local import: only the early-exit path below needs it.
    import sys

    root = os.path.dirname(os.path.abspath(__file__))
    print("Digits Handwriting Recognition using Spark")
    print("Root file path is = %s" %root)

    conf = SparkConf().setAppName("OCR")
    sc = SparkContext(conf = conf)
    sc.setLogLevel("WARN")
    # Never read afterwards, but deliberately kept.
    # NOTE(review): presumably constructing the SQLContext is what makes
    # RDD.toDF available (see the hasattr check below) -- confirm before removing.
    sqlContext = SQLContext(sc)

    print("loading dataset")
    trainRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist")
    testRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist.t")

    # check if rdd support toDF
    if not hasattr(trainRDD, "toDF"):
        print("ERROR: RDD does not support toDF")
        # The os module has no exit(); sys.exit is the call that ends the
        # process with the given status.
        sys.exit(1)

    ## convert RDDs to data frames
    trainDF = trainRDD.toDF()
    testDF = testRDD.toDF()

    print("INFO: train dataframe count = %u" %trainDF.count())
    print("INFO: test dataframe count = %u" %testDF.count())

    indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    dtc = DecisionTreeClassifier(labelCol="indexedLabel")

    # No default-depth fit here: its result was overwritten by the first
    # loop iteration without ever being used.

    ## train multiple depth models
    # range() instead of xrange() so the script also runs on Python 3.
    depths = range(1, 9)
    variedMaxDepthModels = []
    # NOTE: the log messages say "CNN", but the models are decision trees;
    # the strings are left untouched in case anything matches on the output.
    print("Create varied depth CNN models [1..8]")
    for mdepth in depths:
        start = time.time()

        ## maximum depth
        dtc.setMaxDepth(mdepth)

        ## create pipeline
        pipeline = Pipeline(stages = [indexer, dtc])

        ## create the model
        model = pipeline.fit(trainDF)

        ## add to varied container
        variedMaxDepthModels.append(model)

        end = time.time()
        print("trained a CNN depth of %u, duration = [%.3f] secs" %(mdepth, end - start))

    print("=================================================")

    ## report model accuraries
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", metricName="precision")

    ## mdepth
    print("Evaluate all models precision")
    # Models were appended in depth order, so pair each with its depth.
    for mdepth, model in zip(depths, variedMaxDepthModels):
        predictions = model.transform(testDF)
        precision = evaluator.evaluate(predictions)
        print("CNN depth = %u, precision = %.3f" %(mdepth, precision))

    print("Finished processing %u digits" %testDF.count())