def test_onevsrest(self):
    """Check that OneVsRest and its fitted model survive a save/load round trip.

    A OneVsRest estimator wrapping a LogisticRegression is fitted on a
    30-row DataFrame with three distinct labels.  Both the estimator and
    the fitted OneVsRestModel are written below a scratch directory,
    loaded back, and compared with ``self._compare_pipelines``.

    The scratch directory is removed when the test finishes, whether it
    passed or failed, so repeated runs do not accumulate temp folders.
    """
    # Local import keeps this block self-contained; the file-level imports
    # are not visible from here.
    import shutil

    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.createDataFrame(
            [
                (0.0, Vectors.dense(1.0, 0.8)),
                (1.0, Vectors.sparse(2, [], [])),
                (2.0, Vectors.dense(0.5, 0.5)),
            ] * 10,
            ["label", "features"],
        )
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(df)

        # Round-trip the (unfitted) estimator.
        ovrPath = temp_path + "/ovr"
        ovr.save(ovrPath)
        loadedOvr = OneVsRest.load(ovrPath)
        self._compare_pipelines(ovr, loadedOvr)

        # Round-trip the fitted model.
        modelPath = temp_path + "/ovrModel"
        model.save(modelPath)
        loadedModel = OneVsRestModel.load(modelPath)
        self._compare_pipelines(model, loadedModel)
    finally:
        # ignore_errors: a cleanup problem must not mask the real test result.
        shutil.rmtree(temp_path, ignore_errors=True)
def test_save_load(self):
    """Check that OneVsRest and OneVsRestModel can be saved and loaded back.

    A OneVsRest estimator wrapping a LogisticRegression is fitted on a
    three-row DataFrame with three distinct labels.  After reloading the
    estimator, its features column, label column and classifier uid must
    match the original.  After reloading the fitted model, it must hold
    the same number of sub-models as the original, with matching uids in
    the same order.

    The scratch directory is removed when the test finishes, whether it
    passed or failed.
    """
    # Local import keeps this block self-contained; the file-level imports
    # are not visible from here.
    import shutil

    temp_path = tempfile.mkdtemp()
    try:
        sqlContext = SQLContext(self.sc)
        df = sqlContext.createDataFrame(
            [
                (0.0, Vectors.dense(1.0, 0.8)),
                (1.0, Vectors.sparse(2, [], [])),
                (2.0, Vectors.dense(0.5, 0.5)),
            ],
            ["label", "features"],
        )
        lr = LogisticRegression(maxIter=5, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(df)

        # Round-trip the (unfitted) estimator.
        ovrPath = temp_path + "/ovr"
        ovr.save(ovrPath)
        loadedOvr = OneVsRest.load(ovrPath)
        self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
        self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
        self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)

        # Round-trip the fitted model.
        modelPath = temp_path + "/ovrModel"
        model.save(modelPath)
        loadedModel = OneVsRestModel.load(modelPath)
        # zip() stops at the shorter sequence, so without this check a
        # reloaded model with missing sub-models would pass unnoticed.
        self.assertEqual(len(loadedModel.models), len(model.models))
        for m, n in zip(model.models, loadedModel.models):
            self.assertEqual(m.uid, n.uid)
    finally:
        # ignore_errors: a cleanup problem must not mask the real test result.
        shutil.rmtree(temp_path, ignore_errors=True)
# Evaluate the saved Naive Bayes one-vs-rest model on labelled test images.
# Each "tl<N>" sub-directory of imageDir is read and tagged with label N.
imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/resized/"
labelZeroDf = readImages(imageDir + "tl0").withColumn("label", lit(0))
labelOneDf = readImages(imageDir + "tl1").withColumn("label", lit(1))
labelTwoDf = readImages(imageDir + "tl2").withColumn("label", lit(2))
labelThreeDf = readImages(imageDir + "tl3").withColumn("label", lit(3))
labelFourDf = readImages(imageDir + "tl4").withColumn("label", lit(4))

# Stack the five per-label frames into a single test set.
finalTestDf = (
    labelZeroDf
    .unionAll(labelOneDf)
    .unionAll(labelTwoDf)
    .unionAll(labelThreeDf)
    .unionAll(labelFourDf)
)
testSize = finalTestDf.count()

# Turn the "image" column into a "features" column using InceptionV3.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDf)

model_nb = OneVsRestModel.load(imageDir + 'model-naive-bayes-new')
predictions = model_nb.transform(featureVector)
# Pass the row count so that every prediction row is printed.
predictions.show(predictions.count())

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
# The space before "images" was missing, which glued the count to the word.
print("Test Data set accuracy with Naive Bayes for " + str(testSize)
      + " images = " + str(accuracy) + " and error " + str(1 - accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(predictions)
print("Test Data set weightedPrecision with Naive Bayes for " + str(testSize)
      + " images = " + str(weightedPrecision))
"header", "true").load(imageDir + "test25.csv") csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1]))) csvTest = csvTestRDD.toDF() finalTestDataFrame = tmptestX.join(csvTest, tmptestX.fileName == csvTest.image, 'inner').drop(csvTest.image) featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") featureVector = featurizer.transform(finalTestDataFrame) del tmpTestDf del tmpTestRDD del tmpTestX del csvTestTmp del csvTestRDD del csvTest del finalTestDataFrame #print gc.collect() #model_nb = OneVsRestModel.load('hdfs://192.168.65.188:8020/paih/model-naive-bayes') model_dc = OneVsRestModel.load(imageDir + 'model-decision-tree-classifier') predictions = model_dc.transform(featureVector) predictions.show() print '***Transform complete***' evaluator = MulticlassClassificationEvaluator(metricName="accuracy") print("Test Data set accuracy with decision tree classifier for 25 images = " + str(evaluator.evaluate(predictions)) + " and error " + str(1 - evaluator.evaluate(predictions)))
# Evaluate the saved decision-tree one-vs-rest model on the "test5" image set.
# Local alternative: imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work"
imageDir = "hdfs://192.168.65.188:8020/paih/"


def getFileName(filePath):
    """Return the base name of ``filePath`` up to (not including) its first dot."""
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName


# Prepare Test Data
# NOTE(review): imageDir already ends with "/", so the paths built below
# contain "//" -- confirm the filesystem resolves them as intended.
tmpTestDf = readImages(imageDir + "/test5")
# Add a fileName column derived from the image path; it is the join key below.
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()

csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "/test5.csv")
# Convert the CSV rows into (image, label) records with an integer label.
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()

# Attach labels to the images by file name; drop the duplicate join key.
finalTestDataFrame = tmptestX.join(
    csvTest, tmptestX.fileName == csvTest.image, 'inner').drop(csvTest.image)

# Turn the "image" column into a "features" column using InceptionV3.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)

model_dc = OneVsRestModel.load(
    'hdfs://192.168.65.188:8020/paih/model-dicision-tree-classifier')
predictions = model_dc.transform(featureVector)
# Persist because the predictions are consumed by more than one action below.
predictions.persist()
# NOTE(review): the column was created as "filepath" but is selected as
# "filePath"; this presumably relies on case-insensitive column resolution.
predictions.select("filePath", "prediction").show(truncate=False)

predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# Evaluate once and reuse the value: every evaluate() call runs another job.
accuracy = evaluator.evaluate(predictionAndLabels)
print("Test Data set accuracy with Decision tree Classifier = "
      + str(accuracy) + " and error " + str(1 - accuracy))
# Prepare Test Data
tmpTestDf = readImages(imageDir + "/test5")
# Add a fileName column derived from the image path; it is the join key below.
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()

csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "/test5.csv")
# Convert the CSV rows into (image, label) records with an integer label.
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()

# Attach labels to the images by file name; drop the duplicate join key.
finalTestDataFrame = tmptestX.join(
    csvTest, tmptestX.fileName == csvTest.image, 'inner').drop(csvTest.image)

# Turn the "image" column into a "features" column using InceptionV3.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)

# The original read the model from `imgDir`, a name not assigned anywhere in
# the visible code; every other path here is built from `imageDir`.
# NOTE(review): confirm the model is stored under imageDir.
model_gbt = OneVsRestModel.load(
    imageDir + '/model-gradiant-boosted-tre-classifier')
predictions = model_gbt.transform(featureVector)
# Call form of print works on both Python 2 and Python 3.
print('***Transform complete***')

predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# Evaluate once and reuse the value: every evaluate() call re-runs the job.
accuracy = evaluator.evaluate(predictionAndLabels)
print("Test Data set accuracy with gradiant boosted tree classifier = "
      + str(accuracy) + " and error " + str(1 - accuracy))