Example #1
def saveImageFeatures(images, filePath):
    from sparkdl import DeepImageFeaturizer

    # Build featurizer using DeepImageFeaturizer and the InceptionV3 model
    featurizer = DeepImageFeaturizer(inputCol="image",
                                     outputCol="features",
                                     modelName="InceptionV3")

    # Transform images to pull out image (origin, height, width, nChannels, mode, data) and features (udt)
    features = featurizer.transform(images)

    # Push feature information into Parquet file format
    # This might take a few minutes
    dbutils.fs.mkdirs(filePath)

    # Keep only the image origin (source file path) together with the extracted features
    features.select(
        "image.origin",
        "features").coalesce(2).write.mode("overwrite").parquet(filePath)
Example #2
    # Derive the category column from each image path
    udf_categorie = udf(parse_category, StringType())
    df_img = df_img.withColumn('categorie', udf_categorie('path'))

    return df_img
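
# parse_category, udf, and StringType are used above but not defined in this
# snippet. A minimal sketch of the missing pieces, assuming the category is
# simply the parent directory name of each image path (an illustrative guess):
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def parse_category(path):
    # e.g. ".../fruits/apple/img_001.jpg" -> "apple"
    return path.split('/')[-2]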


# # Loading

# Load a DataFrame with path, image, and category columns
spark_df = load_data(path)

# # Preprocessing


from sparkdl import DeepImageFeaturizer

# We'll use ResNet50 for the transformation
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="image_preprocessed",
                                 modelName="ResNet50")
spark_df_preprocessed = featurizer.transform(spark_df).select(
    ['path', 'categorie', 'image_preprocessed'])

# # Saving

# Save as a Parquet file
spark_df_preprocessed.repartition(16).write.format("parquet").mode(
    'overwrite').save(path_to_save + 'preprocessed_parquet')
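The saved features can later be reloaded straight from Parquet; a one-line sketch reusing the same path_to_save:

# Reload the preprocessed features for downstream training.
spark_df_reloaded = spark.read.parquet(path_to_save + 'preprocessed_parquet')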
Example #3
# Prepare Test Data
tmpTestDf = readImages(imageDir + "train25_2")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmpTestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "train25_2.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()
finalTestDataFrame = tmpTestX.join(csvTest, tmpTestX.fileName == csvTest.image,
                                   'inner').drop(csvTest.image)

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)
del tmpTestDf
del tmpTestRDD
del tmpTestX
del csvTestTmp
del csvTestRDD
del csvTest
del finalTestDataFrame

# Previously trained one-vs-rest Naive Bayes model.
model_nb = OneVsRestModel.load(imageDir + 'model-naive-bayes')

# A fitted OneVsRestModel has no fit() method; re-training means rebuilding the
# OneVsRest estimator with a NaiveBayes base classifier and fitting it on the
# new feature vectors.
ovr_nb = OneVsRest(classifier=NaiveBayes(labelCol="label", featuresCol="features"))
model_nb = ovr_nb.fit(featureVector)
model_nb.write().overwrite().save(imageDir + 'model-naive-bayes-retrained')
print('***re-train complete***')
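getFileName is used above but not defined in the snippet; a minimal sketch, assuming the CSV keys each image by its bare file name (that assumption, and the helper itself, are illustrative):

import os


def getFileName(filepath):
    # Keep only the final path component, e.g. "/data/img_001.jpg" -> "img_001.jpg"
    return os.path.basename(filepath)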
Example #4
tmpTrainDf = readImages(imageDir + "/train25")
#tmpTrainDf = readImages(imageDir + "/test1")
tmpTrainRDD = tmpTrainDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmpTrainX = tmpTrainRDD.toDF()
csvTrainTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "/train25.csv")
#csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain, tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)


featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")

# One-vs-rest classifier; note that despite the "svm" names used below, the
# base classifier here is logistic regression.
method = LogisticRegression(maxIter=50, regParam=0.05, elasticNetParam=0.3,
                            labelCol="label")
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_svm = ovr.fit(featureVector)
model_svm.write().overwrite().save(
    'hdfs://192.168.65.188:8020/paih/model-support-vector-machine')

predictions = model_svm.transform(featureVector).persist()
predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictionAndLabels)
print("Train data set accuracy with one-vs-rest logistic regression = " +
      str(accuracy) + " and error " + str(1 - accuracy))
#predictions.show()

# Apply the evaluator to the constructed model on the training data.

# Pipeline outline:
#   - apply deep learning (InceptionV3) for feature extraction
#   - apply a genetic algorithm for feature selection
#   - apply an ML model (e.g. logistic regression)
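The saved model can later be reloaded for scoring with the same OneVsRestModel.load pattern used in the other examples; a short sketch (the reload itself is not part of the script above):

model_reloaded = OneVsRestModel.load(
    'hdfs://192.168.65.188:8020/paih/model-support-vector-machine')
reloadedPredictions = model_reloaded.transform(featureVector)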
Example #5
imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/dst/resized/"

labelZeroDf = readImages(imageDir + "tl0").withColumn("label", lit(0))
labelOneDf = readImages(imageDir + "tl1").withColumn("label", lit(1))
labelTwoDf = readImages(imageDir + "tl2").withColumn("label", lit(2))
labelThreeDf = readImages(imageDir + "tl3").withColumn("label", lit(3))
labelFourDf = readImages(imageDir + "tl4").withColumn("label", lit(4))
finalTestDf = labelZeroDf.unionAll(labelOneDf).unionAll(labelTwoDf).unionAll(
    labelThreeDf).unionAll(labelFourDf)

testSize = finalTestDf.count()
print(str(testSize))
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDf)
model_dc = OneVsRestModel.load(imageDir + 'model-decision-tree-classifier-new')

predictions = model_dc.transform(featureVector)
predictions.show()

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Data set accuracy with decision-tree-classifieer for " +
      str(testSize) + " images = " + str(accuracy) + " and error " +
      str(1 - accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
Example #6
labelZeroDf = readImages(imageDir + "l0").withColumn("label", lit(0))
labelOneDf = readImages(imageDir + "l1").withColumn("label", lit(1))
labelTwoDf = readImages(imageDir + "l2").withColumn("label", lit(2))
labelThreeDf = readImages(imageDir + "l3").withColumn("label", lit(3))
labelFourDf = readImages(imageDir + "l4").withColumn("label", lit(4))
finalTrainDf = labelZeroDf.unionAll(labelOneDf).unionAll(labelTwoDf).unionAll(
    labelThreeDf).unionAll(labelFourDf)

trainSize = finalTrainDf.count()
print(str(trainSize))

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
method = DecisionTreeClassifier(labelCol="label", featuresCol="features")
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDf)
model_dc = ovr.fit(featureVector)
model_dc.write().overwrite().save(imageDir +
                                  'model-decision-tree-classifier-new')
predictions = model_dc.transform(featureVector)

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Train Data set accuracy with decision-tree-classifier for " +
      str(trainSize) + " images = " + str(accuracy) + " and error " +
      str(1 - accuracy))
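The same evaluator pattern extends to other multiclass metrics; a short sketch using F1 (the metric choice is illustrative):

f1_evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                 predictionCol="prediction",
                                                 metricName="f1")
print("Train Data set F1 with decision-tree-classifier = " +
      str(f1_evaluator.evaluate(predictions)))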