Esempio n. 1
0
"""Find out the max depth of the trained Decision Tree, and its total number of nodes."""

# Print the fitted model's string summary.
# NOTE(review): for a DecisionTreeClassificationModel this summary presumably
# reports the tree depth and node count asked for above; `model` is defined
# outside this cell, so confirm its type.
print(model)

"""It appears that the default settings of the Decision Tree implemented in MLlib did not allow us to train a very powerful model!

Before starting to train a Decision Tree, you can tune the max depth it can reach using the [setMaxDepth()](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier.setMaxDepth) method. Train 21 different DTs, varying the max depth from 0 to 20, endpoints included (i.e., [0, 20]). For each value of the parameter, print the accuracy achieved on the test set, and the number of nodes contained in the given DT.

**IMPORTANT:** this parameter sweep can take 30 minutes or more, depending on how busy your Colab instance is. Notice how the induction time grows super-linearly!
"""

# Sweep the maximum tree depth over [0, 20] (21 models, endpoints included),
# as the exercise statement requires, printing the training accuracy, the
# test accuracy and the fitted model summary for every depth.

# The evaluator configuration never changes between iterations, so build it
# once instead of twice per depth.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

for depth in range(21):
    dt = DecisionTreeClassifier(maxDepth=depth)
    model = dt.fit(training)

    # Accuracy on the data the tree was trained on.
    predictions = model.transform(training)
    accuracy = evaluator.evaluate(predictions)
    print("The training accuracy of depth {} is {}".format(depth, accuracy))

    # Accuracy on the held-out test set.
    predictions = model.transform(test)
    accuracy = evaluator.evaluate(predictions)
    print("The test accuracy of depth {} is {}".format(depth, accuracy))

    # Print the fitted model's string summary for this depth.
    print(model)

"""Once you have working code for each cell above, **head over to Gradescope, read the questions carefully, and submit your solution for this Colab**!"""
    "pcg42", "pcg43", "pcg44", "pcg45", "pcg46", "sp1", "sp2", "sp3", "sp4",
    "sp5", "sp6", "sp7", "sp8", "sp9", "sp10", "sp11", "sp12", "sp13", "pg1",
    "pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11",
    "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2",
    "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max",
    "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max",
    "labCount_min", "labCount_ave", "labcount_months"
])
# Name the assembled vector column "features".
vecAssembler.setOutputCol("features")
# Call print as a function: with a single argument this behaves the same on
# Python 2 and Python 3, whereas the print statement is a SyntaxError on 3.
print(vecAssembler.explainParams())

from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree classifier predicting the "Readmitlabel" column, allowed to
# grow up to depth 30.
aft = DecisionTreeClassifier(labelCol="Readmitlabel", maxDepth=30)

# Call print as a function: with a single argument this behaves the same on
# Python 2 and Python 3, whereas the print statement is a SyntaxError on 3.
print(aft.explainParams())

# COMMAND ----------

from pyspark.ml import Pipeline

# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
# Build the spark.ml pipeline in one step: first assemble the feature vector,
# then run the decision tree classifier on it.
lrPipeline = Pipeline(stages=[vecAssembler, aft])

# A Pipeline is itself an Estimator, so fitting it on the data yields the
# trained pipeline model.
lrPipelineModel = lrPipeline.fit(finaldf)
def main():
    """Train decision trees of max depth 1..8 on MNIST and print test precision.

    Loads the libsvm files ``dataset/svm/mnist`` and ``dataset/svm/mnist.t``
    located next to this script, fits one ``StringIndexer`` +
    ``DecisionTreeClassifier`` pipeline per depth on the training data, then
    evaluates every fitted pipeline on the test data. Progress, timings and
    results are printed to stdout.

    Exits with status 1 if the loaded RDD has no ``toDF`` method. The
    SparkContext is stopped on every exit path.
    """
    # Local import: the original called ``os.exit``, which does not exist.
    import sys

    root = os.path.dirname(os.path.abspath(__file__))

    print("Digits Handwriting Recognition using Spark")
    print("Root file path is = %s" % root)
    conf = SparkConf().setAppName("OCR")
    sc = SparkContext(conf=conf)

    try:
        sc.setLogLevel("WARN")

        # Kept from the original even though the variable is unused: it is
        # created before the ``toDF`` check below.
        # NOTE(review): creating the SQLContext presumably is what attaches
        # ``toDF`` to RDDs — confirm against the targeted Spark version.
        sqlContext = SQLContext(sc)

        print("loading dataset")
        trainRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist")
        testRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist.t")

        # check if rdd support toDF
        if not hasattr(trainRDD, "toDF"):
            print("ERROR: RDD does not support toDF")
            sys.exit(1)

        ## convert RDDs to data frames
        trainDF = trainRDD.toDF()
        testDF = testRDD.toDF()

        print("INFO: train dataframe count = %u" % trainDF.count())
        print("INFO: test dataframe count = %u" % testDF.count())

        indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        dtc = DecisionTreeClassifier(labelCol="indexedLabel")

        # The original fitted one extra pipeline here whose model was
        # overwritten before use; that wasted training pass is dropped.

        ## train multiple depth models
        depths = range(1, 9)
        variedMaxDepthModels = []

        print("Create varied depth decision tree models [1..8]")
        for mdepth in depths:
            start = time.time()

            ## maximum depth
            dtc.setMaxDepth(mdepth)

            ## create pipeline and fit the model
            pipeline = Pipeline(stages=[indexer, dtc])
            model = pipeline.fit(trainDF)

            ## add to varied container
            variedMaxDepthModels.append(model)

            end = time.time()

            print("trained a decision tree of depth %u, duration = [%.3f] secs"
                  % (mdepth, end - start))

        print("=================================================")

        ## report model accuracies
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", metricName="precision")

        print("Evaluate all models precision")
        for mdepth, model in zip(depths, variedMaxDepthModels):
            predictions = model.transform(testDF)
            precision = evaluator.evaluate(predictions)

            print("decision tree depth = %u, precision = %.3f"
                  % (mdepth, precision))

        print("Finished processing %u digits" % testDF.count())
    finally:
        # Release the Spark context on success, on error and on sys.exit().
        sc.stop()