def getSampleImageDF():
    """Load the sample JPEG images using the PIL-based decoder.

    The images are read from the directory reported by
    ``_getSampleJPEGDir()``. The return value is whatever
    ``imageIO.readImagesWithCustomFn`` yields for that directory
    (presumably an image DataFrame, going by this function's name).
    """
    sample_dir = _getSampleJPEGDir()
    return imageIO.readImagesWithCustomFn(
        path=sample_dir,
        decode_f=imageIO.PIL_decode,
    )
# Disabled Spark configuration override, kept for reference:
# conf = spark.sparkContext._conf.setAll([
#     ('spark.executor.memory', '4g'),
#     ('spark.app.name', 'Spark Updated Conf'),
#     ('spark.executor.cores', '4'),
#     ('spark.cores.max', '4'),
#     ('spark.driver.memory', '8g'),
# ])

# Root directory of the flower photo dataset.
imageDir = "T://courses//BigData//data//flower_photos"

# Disabled alternative: load all images recursively as one labelled frame.
# image_df = ImageSchema.readImages(imageDir, recursive=True).withColumn("label", lit(1))
# image_df.printSchema()
# image_df.show(5)
# train_df, test_df, _ = image_df.randomSplit([0.1, 0.05, 0.85])

# Read the two classes using two different readers:
# tulips (label 1) via ImageSchema, daisies (label 0) via the PIL decoder.
tulips_df = ImageSchema.readImages(imageDir + "/tulips").withColumn(
    "label", lit(1)
)
daisy_df = imageIO.readImagesWithCustomFn(
    path=imageDir + "/daisy",
    decode_f=imageIO.PIL_decode,
).withColumn("label", lit(0))

# Three-way random split per class; the third (largest) slice is discarded.
# Use larger training weights (e.g. [0.6, 0.4]) to get more images.
tulips_train, tulips_test, _ = tulips_df.randomSplit([0.1, 0.05, 0.85])
daisy_train, daisy_test, _ = daisy_df.randomSplit([0.1, 0.05, 0.85])

# Combine both classes into the final training and test sets.
train_df = tulips_train.unionAll(daisy_train)
test_df = tulips_test.unionAll(daisy_test)

# Under the hood, each of the partitions is fully loaded in memory, which
# may be expensive. Repartitioning ensures that each partition stays small.
train_df = train_df.repartition(100)
test_df = test_df.repartition(100)
# df_row = spark.createDataFrame(rdd)


def assert_spark_df(df):
    """Assert that ``df`` is an instance of ``pyspark.sql.dataframe.DataFrame``."""
    spark_df_type = pyspark.sql.dataframe.DataFrame
    assert isinstance(df, spark_df_type), "Not a Spark DF"


def assert_spark_model(model):
    """Assert that ``model`` is an instance of ``pyspark.ml.PipelineModel``."""
    pipeline_model_type = pyspark.ml.PipelineModel
    assert isinstance(model, pipeline_model_type), "Not a model"


# Shared fixture data, built once at import time:
# tulips are labelled 1 and daisies 0, each read through a different reader.
tulips_df = ImageSchema.readImages("tests/testtulips/").withColumn(
    "label", lit(1)
)
daisy_df = imageIO.readImagesWithCustomFn(
    path="tests/testdaisy/",
    decode_f=imageIO.PIL_decode,
).withColumn("label", lit(0))

train_df = tulips_df.unionAll(daisy_df)


def test_image_classifier_lr():
    """Check the types of the two values returned by ``image_classifier_lr``."""
    fitted_model, predictions = op.dl.image_classifier_lr(train_df)

    assert_spark_model(fitted_model)
    assert_spark_df(predictions)


def test_evaluate_img_lr():
    """Run ``evaluate_image_classifier`` on a freshly trained model."""
    fitted_model, _predictions = op.dl.image_classifier_lr(train_df)

    # NOTE(review): the evaluation result is stored but never asserted on, so
    # this test only verifies that the call does not raise. Confirm the
    # return contract of evaluate_image_classifier before adding a check.
    result = op.dl.evaluate_image_classifier(train_df, fitted_model)
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import lit
from sparkdl import DeepImageFeaturizer
from sparkdl.image import imageIO

# Root directory of the flower photo dataset.
img_dir = "/home/lshang/Downloads/flower_photos"

# Load each class with the PIL decoder and attach its label:
# tulips are labelled 1 and daisies 0.
tulips_df = imageIO.readImagesWithCustomFn(
    path=img_dir + "/tulips",
    decode_f=imageIO.PIL_decode,
).withColumn("label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(
    path=img_dir + "/daisy",
    decode_f=imageIO.PIL_decode,
).withColumn("label", lit(0))

# Random split of each class with weights 0.6 (train) and 0.4 (test).
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])

# DataFrame for training a classification model.
train_df = tulips_train.unionAll(daisy_train)

# DataFrame for testing the classification model.
test_df = tulips_test.unionAll(daisy_test)

# Pipeline components: an InceptionV3 featurizer reading the "image" column
# and writing "features", and a logistic regression trained on "label".
featurizer = DeepImageFeaturizer(
    inputCol="image",
    outputCol="features",
    modelName="InceptionV3",
)
lr = LogisticRegression(
    maxIter=20,
    regParam=0.05,
    elasticNetParam=0.3,
    labelCol="label",
)
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import lit
from sparkdl import DeepImageFeaturizer
from sparkdl.image import imageIO

# Root directory of the personalities photo dataset.
img_dir = "/home/lshang/Downloads/personalities"

# Load each class with the PIL decoder and attach its label:
# "jobs" images are labelled 1 and "zuckerberg" images 0.
jobs_df = imageIO.readImagesWithCustomFn(
    path=img_dir + "/jobs",
    decode_f=imageIO.PIL_decode,
).withColumn("label", lit(1))
zuckerberg_df = imageIO.readImagesWithCustomFn(
    path=img_dir + "/zuckerberg",
    decode_f=imageIO.PIL_decode,
).withColumn("label", lit(0))

# Random split of each class with weights 0.6 (train) and 0.4 (test).
jobs_train, jobs_test = jobs_df.randomSplit([0.6, 0.4])
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit([0.6, 0.4])

# DataFrame for training a classification model.
train_df = jobs_train.unionAll(zuckerberg_train)

# DataFrame for testing the classification model.
test_df = jobs_test.unionAll(zuckerberg_test)

# Pipeline components: an InceptionV3 featurizer reading the "image" column
# and writing "features", and a logistic regression trained on "label".
featurizer = DeepImageFeaturizer(
    inputCol="image",
    outputCol="features",
    modelName="InceptionV3",
)
lr = LogisticRegression(
    maxIter=20,
    regParam=0.05,
    elasticNetParam=0.3,
    labelCol="label",
)