Beispiel #1
0
def getSampleImageDF():
    """Return the result of reading the sample JPEG directory via imageIO.

    The directory comes from ``_getSampleJPEGDir()`` and the files are decoded
    with ``imageIO.PIL_decode``.
    """
    sample_dir = _getSampleJPEGDir()
    return imageIO.readImagesWithCustomFn(
        path=sample_dir,
        decode_f=imageIO.PIL_decode,
    )
# conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), (
#    'spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory', '8g')])

# Root directory holding one sub-directory of images per flower class.
imageDir = "T://courses//BigData//data//flower_photos"

# Load images
# image_df = ImageSchema.readImages(imageDir, recursive = True).withColumn("label", lit(1))
# image_df.printSchema()
# image_df.show(5)
# train_df, test_df, _=image_df.randomSplit([0.1, 0.05, 0.85])

# Read the images using two methods: tulips (label 1) through
# ImageSchema.readImages, daisies (label 0) through imageIO with the PIL decoder.
tulips_df = ImageSchema.readImages(imageDir + "/tulips").withColumn(
    "label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(
    imageDir + "/daisy",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(0))

# Only the first two splits are used; the third one is discarded.
# Use larger training sets (e.g. [0.6, 0.4]) to get more images.
tulips_train, tulips_test, _ = tulips_df.randomSplit([0.1, 0.05, 0.85])
daisy_train, daisy_test, _ = daisy_df.randomSplit([0.1, 0.05, 0.85])

# DataFrame.union is used instead of unionAll, which is deprecated in Spark 2.x
# and behaves identically.
train_df = tulips_train.union(daisy_train)
test_df = tulips_test.union(daisy_test)

# Under the hood, each of the partitions is fully loaded in memory, which may be expensive.
# This ensures that each of the partitions has a small size.
train_df = train_df.repartition(100)
test_df = test_df.repartition(100)
Beispiel #3
0
# df_row = spark.createDataFrame(rdd)


def assert_spark_df(df):
    """Assert that ``df`` is a ``pyspark.sql.dataframe.DataFrame``."""
    is_spark_df = isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert is_spark_df, "Not a Spark DF"


def assert_spark_model(model):
    """Assert that ``model`` is a ``pyspark.ml.PipelineModel``."""
    is_pipeline_model = isinstance(model, pyspark.ml.PipelineModel)
    assert is_pipeline_model, "Not a model"


# Shared fixture for the tests below: tulips are labelled 1 and daisies 0,
# each class being read through a different image loader.
tulips_df = ImageSchema.readImages("tests/testtulips/").withColumn(
    "label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(
    "tests/testdaisy/",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(0))

# DataFrame.union is used instead of unionAll, which is deprecated in Spark 2.x
# and behaves identically.
train_df = tulips_df.union(daisy_df)


def test_image_classifier_lr():
    """Check the types of the two values returned by ``op.dl.image_classifier_lr``.

    The first value must pass ``assert_spark_model`` and the second must pass
    ``assert_spark_df``.
    """
    fitted_model, predictions = op.dl.image_classifier_lr(train_df)

    assert_spark_model(fitted_model)
    assert_spark_df(predictions)


def test_evaluate_img_lr():
    """Train the LR image classifier, then evaluate it on the same DataFrame."""
    fitted_model, df_preds = op.dl.image_classifier_lr(train_df)
    # NOTE(review): the evaluation result is computed but never asserted on, so
    # this test only verifies that both calls complete without raising.
    result = op.dl.evaluate_image_classifier(train_df, fitted_model)
from pyspark.sql.functions import lit
from sparkdl.image import imageIO
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Root directory holding one sub-directory of images per flower class.
img_dir = "/home/lshang/Downloads/flower_photos"

# Tulips are labelled 1 and daisies 0; both are decoded with the PIL decoder.
tulips_df = imageIO.readImagesWithCustomFn(
    img_dir + "/tulips",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(
    img_dir + "/daisy",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(0))

# Split each class separately with 0.6 / 0.4 weights (train / test).
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])

# DataFrame.union is used instead of unionAll, which is deprecated in Spark 2.x
# and behaves identically.

# DataFrame for training a classification model.
train_df = tulips_train.union(daisy_train)

# DataFrame for testing the classification model.
test_df = tulips_test.union(daisy_test)

# The featurizer reads the "image" column and writes its output to "features".
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
# Logistic regression trained against the "label" column.
lr = LogisticRegression(maxIter=20,
                        regParam=0.05,
                        elasticNetParam=0.3,
                        labelCol="label")
from pyspark.sql.functions import lit
from sparkdl.image import imageIO
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Root directory holding one sub-directory of images per person.
img_dir = "/home/lshang/Downloads/personalities"

# "jobs" images are labelled 1 and "zuckerberg" images 0; both are decoded
# with the PIL decoder.
jobs_df = imageIO.readImagesWithCustomFn(
    img_dir + "/jobs",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(1))
zuckerberg_df = imageIO.readImagesWithCustomFn(
    img_dir + "/zuckerberg",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(0))

# Split each class separately with 0.6 / 0.4 weights (train / test).
jobs_train, jobs_test = jobs_df.randomSplit([0.6, 0.4])
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit([0.6, 0.4])

# DataFrame.union is used instead of unionAll, which is deprecated in Spark 2.x
# and behaves identically.

# DataFrame for training a classification model.
train_df = jobs_train.union(zuckerberg_train)

# DataFrame for testing the classification model.
test_df = jobs_test.union(zuckerberg_test)

# The featurizer reads the "image" column and writes its output to "features".
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
# Logistic regression trained against the "label" column.
lr = LogisticRegression(maxIter=20,
                        regParam=0.05,
                        elasticNetParam=0.3,
                        labelCol="label")
def getSampleImageDF():
    """Return the result of reading the sample JPEG directory via imageIO.

    The directory comes from ``_getSampleJPEGDir()`` and the files are decoded
    with ``imageIO.PIL_decode``.
    """
    jpeg_dir = _getSampleJPEGDir()
    return imageIO.readImagesWithCustomFn(path=jpeg_dir,
                                          decode_f=imageIO.PIL_decode)