def saveImageFeatures(images, filePath):
    from sparkdl import DeepImageFeaturizer

    # Build featurizer using DeepImageFeaturizer and the InceptionV3 model
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName="InceptionV3")

    # Transform images to pull out image (origin, height, width, nChannels, mode, data)
    # and features (udt)
    features = featurizer.transform(images)

    # Push feature information into Parquet file format
    # This might take a few minutes
    dbutils.fs.mkdirs(filePath)

    # Keep only the image origin (file path) alongside the extracted features
    features.select("image.origin", "features").coalesce(2).write.mode("overwrite").parquet(filePath)
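# A minimal sketch (not part of the original code) of reading the saved features
# back; `loadImageFeatures` is a hypothetical helper, and it assumes an active
# SparkSession named `spark` and the same filePath used above.
def loadImageFeatures(filePath):
    # Read the Parquet files written by saveImageFeatures: image origin + features
    features = spark.read.parquet(filePath)
    features.printSchema()
    return features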
    # categories
    udf_categorie = udf(parse_category, StringType())
    df_img = df_img.withColumn('categorie', udf_categorie('path'))
    return df_img


# # Loading

# Loading of a DataFrame with path, images, and categories
spark_df = load_data(path)


# # Preprocessing

# In[11]:

from sparkdl import DeepImageFeaturizer

# We'll use ResNet50 for the transformation
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="image_preprocessed",
                                 modelName="ResNet50")
spark_df_preprocessed = featurizer.transform(spark_df).select(
    ['path', 'categorie', 'image_preprocessed'])


# # Saving

# Saving as a Parquet file
spark_df_preprocessed.repartition(16).write.format("parquet").mode(
    'overwrite').save(path_to_save + 'preprocessed_parquet')
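# A minimal sketch (not in the original notebook) of reading the preprocessed
# features back for a later training step; it assumes the same `path_to_save`
# and an active SparkSession named `spark`.
spark_df_preprocessed = spark.read.parquet(path_to_save + 'preprocessed_parquet')
spark_df_preprocessed.printSchema()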
# Prepare Test Data
tmpTestDf = readImages(imageDir + "train25_2")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()

csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "train25_2.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()

finalTestDataFrame = tmptestX.join(
    csvTest, tmptestX.fileName == csvTest.image, 'inner').drop(csvTest.image)

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)

del tmpTestDf
del tmpTestRDD
del tmptestX
del csvTestTmp
del csvTestRDD
del csvTest
del finalTestDataFrame

# A loaded OneVsRestModel is already fitted and has no fit() method, so
# "re-training" here means fitting a fresh OneVsRest estimator on the new data
# (a NaiveBayes base classifier is assumed, matching the saved model's name)
# and saving the result under a new path.
from pyspark.ml.classification import NaiveBayes, OneVsRest  # assumed imports, not shown in the original

model_nb = OneVsRestModel.load(imageDir + 'model-naive-bayes')
ovr_nb = OneVsRest(classifier=NaiveBayes(labelCol="label", featuresCol="features"))
model_nb = ovr_nb.fit(featureVector)
model_nb.write().overwrite().save(imageDir + 'model-naive-bayes-retrained')
print('***re-train complete***')
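# A minimal sketch (not in the original script) of checking the re-trained model
# on the same featurized data, following the evaluator pattern used elsewhere in
# this code base.
predictions = model_nb.transform(featureVector)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Re-trained naive-bayes accuracy = " + str(evaluator.evaluate(predictions)))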
tmpTrainDf = readImages(imageDir + "/train25")
#tmpTrainDf = readImages(imageDir + "/test1")
tmpTrainRDD = tmpTrainDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmpTrainX = tmpTrainRDD.toDF()

csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/train25.csv")
#csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTrain = csvTrainRDD.toDF()

finalTrainDataFrame = tmpTrainX.join(
    csvTrain, tmpTrainX.fileName == csvTrain.image, 'inner').drop(csvTrain.image)

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
# Note: despite the variable and path names below, the classifier here is
# logistic regression wrapped in one-vs-rest, not a support vector machine.
method = LogisticRegression(maxIter=50, regParam=0.05, elasticNetParam=0.3, labelCol="label")
ovr = OneVsRest(classifier=method)

featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_svm = ovr.fit(featureVector)
model_svm.write().overwrite().save('hdfs://192.168.65.188:8020/paih/model-support-vector-machine')

predictions = model_svm.transform(featureVector).persist()
predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictionAndLabels)
print("Train Data set accuracy with logistic regression (one-vs-rest) = " +
      str(accuracy) + " and error " + str(1 - accuracy))
#predictions.show()

# TODO:
#   apply evaluator on the constructed model for training data
#   apply deep learning for feature extraction
#   apply a genetic algorithm for feature selection (placeholder sketch below)
#   apply an ML model (e.g. logistic regression)
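# The TODO above calls for feature selection before the classifier. A genetic
# algorithm is not implemented in this code; as a purely hypothetical placeholder,
# the sketch below uses Spark ML's ChiSqSelector on the InceptionV3 features to
# show where such a step would slot into the pipeline.
from pyspark.ml.feature import ChiSqSelector

selector = ChiSqSelector(numTopFeatures=500, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="label")
selectedVector = selector.fit(featureVector).transform(featureVector)
# A classifier would then be trained on "selectedFeatures" instead of "features".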
imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/dst/resized/"

labelZeroDf = readImages(imageDir + "tl0").withColumn("label", lit(0))
labelOneDf = readImages(imageDir + "tl1").withColumn("label", lit(1))
labelTwoDf = readImages(imageDir + "tl2").withColumn("label", lit(2))
labelThreeDf = readImages(imageDir + "tl3").withColumn("label", lit(3))
labelFourDf = readImages(imageDir + "tl4").withColumn("label", lit(4))

finalTestDf = labelZeroDf.unionAll(labelOneDf).unionAll(labelTwoDf).unionAll(
    labelThreeDf).unionAll(labelFourDf)
testSize = finalTestDf.count()
print(str(testSize))

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDf)

model_dc = OneVsRestModel.load(imageDir + 'model-decision-tree-classifier-new')
predictions = model_dc.transform(featureVector)
predictions.show()

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Data set accuracy with decision-tree-classifier for " + str(testSize) +
      " images = " + str(accuracy) + " and error " + str(1 - accuracy))

# The original script breaks off while building a second evaluator; the metric
# below (f1) is an assumption, since it is not shown in the source.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="f1")
print("Test Data set f1 with decision-tree-classifier = " + str(evaluator.evaluate(predictions)))
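# A minimal sketch (not in the original script) of a per-class breakdown, which
# can be read as a confusion matrix of label vs. prediction counts.
predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()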
labelZeroDf = readImages(imageDir + "l0").withColumn("label", lit(0)) labelOneDf = readImages(imageDir + "l1").withColumn("label", lit(1)) labelTwoDf = readImages(imageDir + "l2").withColumn("label", lit(2)) labelThreeDf = readImages(imageDir + "l3").withColumn("label", lit(3)) labelFourDf = readImages(imageDir + "l4").withColumn("label", lit(4)) finalTrainDf = labelZeroDf.unionAll(labelOneDf).unionAll(labelTwoDf).unionAll( labelThreeDf).unionAll(labelFourDf) trainSize = finalTrainDf.count() print(str(trainSize)) featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3") method = DecisionTreeClassifier(labelCol="label", featuresCol="features") ovr = OneVsRest(classifier=method) featureVector = featurizer.transform(finalTrainDf) model_dc = ovr.fit(featureVector) model_dc.write().overwrite().save(imageDir + 'model-decision-tree-classifier-new') predictions = model_dc.transform(featureVector) evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Train Data set accuracy with decision-tree-classifier for " + str(trainSize) + " images = " + str(accuracy) + " and error " + str(1 - accuracy))