Example #1
 def test_read_images_multiple_times(self):
     # This test case is to check if `ImageSchema.readImages` tries to
     # initiate Hive client multiple times. See SPARK-22651.
     data_path = 'data/mllib/images/origin/kittens'
     ImageSchema.readImages(data_path,
                            recursive=True,
                            dropImageFailures=True)
     ImageSchema.readImages(data_path,
                            recursive=True,
                            dropImageFailures=True)
Example #2
    def test_read_images(self):
        data_path = "data/mllib/images/origin/kittens"
        df = (self.spark.read.format("image")
              .option("dropInvalid", True)
              .option("recursiveFileLookup", True)
              .load(data_path))
        self.assertEqual(df.count(), 4)
        first_row = df.take(1)[0][0]
        # compare `schema.simpleString()` instead of directly compare schema,
        # because the df loaded from datasource may change schema column nullability.
        self.assertEqual(df.schema.simpleString(),
                         ImageSchema.imageSchema.simpleString())
        self.assertEqual(df.schema["image"].dataType.simpleString(),
                         ImageSchema.columnSchema.simpleString())
        array = ImageSchema.toNDArray(first_row)
        self.assertEqual(len(array), first_row[1])
        self.assertEqual(ImageSchema.toImage(array, origin=first_row[0]),
                         first_row)
        expected = {
            "CV_8UC3": 16,
            "Undefined": -1,
            "CV_8U": 0,
            "CV_8UC1": 0,
            "CV_8UC4": 24
        }
        self.assertEqual(ImageSchema.ocvTypes, expected)
        expected = ["origin", "height", "width", "nChannels", "mode", "data"]
        self.assertEqual(ImageSchema.imageFields, expected)
        self.assertEqual(ImageSchema.undefinedImageType, "Undefined")

        with QuietTest(self.sc):
            self.assertRaisesRegex(
                TypeError,
                "image argument should be pyspark.sql.types.Row; however",
                lambda: ImageSchema.toNDArray("a"),
            )

        with QuietTest(self.sc):
            self.assertRaisesRegex(
                ValueError,
                "image argument should have attributes specified in",
                lambda: ImageSchema.toNDArray(Row(a=1)),
            )

        with QuietTest(self.sc):
            self.assertRaisesRegex(
                TypeError,
                "array argument should be numpy.ndarray; however, it got",
                lambda: ImageSchema.toImage("a"),
            )
Example #3
def getDataFrame(img_dir):
  dic = {}
  df_train = []
  df_test = []
  count = 0

  for root, directories, files in os.walk(img_dir):
    for file in directories:
      temp_df = ImageSchema.readImages(img_dir + "/" + file).withColumn("label", lit(count))
      train_df, test_df = temp_df.randomSplit([0.6, 0.4])
      df_train.append(train_df)
      df_test.append(test_df)
      if count not in dic:
        dic[count] = file
        count += 1

  trained_df = df_train[0]
  for i in range(1, len(df_train)):
    trained_df = trained_df.unionAll(df_train[i])

  tested_df = df_test[0]
  for i in range(1, len(df_test)):
    tested_df = tested_df.unionAll(df_test[i])

  return trained_df, tested_df, dic
    def _create_image_df_with_label(image_folder):
        """
        Creates a image data frame for a given image class (same label)

        :param image_folder: Folder which contains a single type of images
        :return: DataFrame with label and image columns (all with same label)
        """

        label = int(image_folder.stem[1:])
        path = ImageUtils._find_images_path(image_folder)

        return ImageSchema.readImages(path).withColumn('label', functions.lit(label))
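A brief usage sketch for the helper above; the `images_by_class` directory and the assumption that the method is a static helper on `ImageUtils` are hypothetical:

# Hypothetical usage (not from the original source): build one labelled DataFrame
# per class folder and union them. Folder names like "c0", "c1", ... are assumed,
# matching int(image_folder.stem[1:]) above.
from pathlib import Path

class_folders = sorted(p for p in Path("images_by_class").iterdir() if p.is_dir())
full_df = None
for folder in class_folders:
    class_df = ImageUtils._create_image_df_with_label(folder)
    full_df = class_df if full_df is None else full_df.unionAll(class_df)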
Example #5
def createDataFrame(spark, sc):
    sparkHomeDir = "file:/Users/beginspark/Apps/spark"

    # 1. Create DataFrames from external data sources
    df1 = spark.read.json(sparkHomeDir +
                          "/examples/src/main/resources/people.json")
    df2 = spark.read.parquet(sparkHomeDir +
                             "/examples/src/main/resources/users.parquet")
    df3 = spark.read.text(sparkHomeDir +
                          "/examples/src/main/resources/people.txt")

    # 2. Create a DataFrame from a local collection (ex5-17)
    row1 = Row(name="hayoon", age=7, job="student")
    row2 = Row(name="sunwoo", age=13, job="student")
    row3 = Row(name="hajoo", age=5, job="kindergartener")
    row4 = Row(name="jinwoo", age=13, job="student")
    data = [row1, row2, row3, row4]
    df4 = spark.createDataFrame(data)

    # 3. Create a DataFrame from an existing RDD (ex5-20)
    rdd = spark.sparkContext.parallelize(data)
    df5 = spark.createDataFrame(rdd)

    # 4. Create a DataFrame with an explicit schema (ex5-23)
    sf1 = StructField("name", StringType(), True)
    sf2 = StructField("age", IntegerType(), True)
    sf3 = StructField("job", StringType(), True)
    schema = StructType([sf1, sf2, sf3])
    r1 = ("hayoon", 7, "student")
    r2 = ("sunwoo", 13, "student")
    r3 = ("hajoo", 5, "kindergartener")
    r4 = ("jinwoo", 13, "student")
    rows = [r1, r2, r3, r4]
    df6 = spark.createDataFrame(rows, schema)

    # 5. Create a DataFrame from images
    path = sparkHomeDir + "/data/mllib/images"
    recursive = True
    numPartitions = 2
    dropImageFailures = True
    sampleRatio = 1.0
    seed = 0
    imgdf = ImageSchema.readImages(path, recursive, numPartitions,
                                   dropImageFailures, sampleRatio, seed)

    imgdf = imgdf.select(imgdf["image.origin"], imgdf["image.height"],
                         imgdf["image.width"], imgdf["image.nChannels"],
                         imgdf["image.mode"])
Example #6
 def image_predictor(path,
                     input_col="image",
                     output_col="predicted_labels",
                     model_name="InceptionV3",
                     decode_predictions=True,
                     topK=10):
     image_df = ImageSchema.readImages(path)
     predictor = DeepImagePredictor(inputCol=input_col,
                                    outputCol=output_col,
                                    modelName=model_name,
                                    decodePredictions=decode_predictions,
                                    topK=topK)
     preds = predictor.transform(image_df)
     # Return (label, probability) as strings so the result matches ArrayType(StringType()).
     firstelement = udf(lambda v: [str(v[0][1]), str(v[0][2])],
                        ArrayType(StringType()))
     return preds.select(
         firstelement('predicted_labels').alias("predicted_labels"))
Example #7
    def test_read_images(self):
        data_path = 'data/mllib/images/origin/kittens'
        df = ImageSchema.readImages(data_path,
                                    recursive=True,
                                    dropImageFailures=True)
        self.assertEqual(df.count(), 4)
        first_row = df.take(1)[0][0]
        array = ImageSchema.toNDArray(first_row)
        self.assertEqual(len(array), first_row[1])
        self.assertEqual(ImageSchema.toImage(array, origin=first_row[0]),
                         first_row)
        self.assertEqual(df.schema, ImageSchema.imageSchema)
        self.assertEqual(df.schema["image"].dataType, ImageSchema.columnSchema)
        expected = {
            'CV_8UC3': 16,
            'Undefined': -1,
            'CV_8U': 0,
            'CV_8UC1': 0,
            'CV_8UC4': 24
        }
        self.assertEqual(ImageSchema.ocvTypes, expected)
        expected = ['origin', 'height', 'width', 'nChannels', 'mode', 'data']
        self.assertEqual(ImageSchema.imageFields, expected)
        self.assertEqual(ImageSchema.undefinedImageType, "Undefined")

        with QuietTest(self.sc):
            self.assertRaisesRegexp(
                TypeError,
                "image argument should be pyspark.sql.types.Row; however",
                lambda: ImageSchema.toNDArray("a"))

        with QuietTest(self.sc):
            self.assertRaisesRegexp(
                ValueError,
                "image argument should have attributes specified in",
                lambda: ImageSchema.toNDArray(Row(a=1)))

        with QuietTest(self.sc):
            self.assertRaisesRegexp(
                TypeError,
                "array argument should be numpy.ndarray; however, it got",
                lambda: ImageSchema.toImage("a"))
Example #8
# MAGIC
# MAGIC ### Prepare training and validation dataframes
# MAGIC
# MAGIC Deep Learning Pipelines requires training data to be loaded into Spark DataFrames. The code below uses Spark's native support for image data to load 6000 training images into a DataFrame. It then adds a new column called `label` that annotates each image with the type of land it depicts. The label is extracted from the pathname of the image.

# COMMAND ----------

# MAGIC %md
# MAGIC #### Load training images to a dataframe

# COMMAND ----------

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit

img_df = ImageSchema.readImages(img_dir + 'train', recursive=True)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Add the label column and split data into training and validation DataFrames.

# COMMAND ----------

from pyspark.sql.functions import regexp_extract, col
from pyspark.ml.feature import StringIndexer

# Add a label column
img_labeled = img_df.withColumn(
    'label', regexp_extract(col('image.origin'), r'(.)(train/)(\w+)', 3))
# Split a dataframe into training and validation dataframes
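The split announced by the comment above is cut off in this excerpt; a minimal sketch of how it could look, where the 80/20 ratio and the seed are assumptions rather than values from the original notebook:

# Assumed 80/20 train/validation split; ratio and seed are illustrative only.
train_df, val_df = img_labeled.randomSplit([0.8, 0.2], seed=42)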
Example #9
def load_images_path_and_shuffle():

    annual_crop_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(0))
    forest_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(1))
    herb_veg_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(2))
    highway_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(3))
    industrial_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(4))
    pasture_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(5))
    perm_crop_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(6))
    residential_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(7))
    river_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(8))
    sea_lake_df = ImageSchema.readImages("path_to_images").withColumn("label", lit(9))

    annual_crop_train, annual_crop_vali = annual_crop_df.randomSplit([0.4, 0.6])
    forest_train, forest_vali = forest_df.randomSplit([0.4, 0.6])
    herb_veg_train, herb_veg_vail = herb_veg_df.randomSplit([0.4, 0.6])
    highway_train, highway_vali = highway_df.randomSplit([0.4, 0.6])
    industrial_train, industrial_vali = industrial_df.randomSplit([0.4, 0.6])
    pasture_train, pasture_vali = pasture_df.randomSplit([0.4, 0.6])
    perm_crop_train, perm_crop_vali = perm_crop_df.randomSplit([0.4, 0.6])
    residential_train, residential_vali = residential_df.randomSplit([0.4, 0.6])
    river_train, river_vali = river_df.randomSplit([0.4, 0.6])
    sea_lake_train, sea_lake_vali = sea_lake_df.randomSplit([0.4, 0.6])

    train_df_phase1 = annual_crop_train.union(forest_train)
    vali_df_pase1 = annual_crop_vali.union(forest_vali)

    train_df_phase2 = train_df_phase1.union(herb_veg_train)
    vali_df_pase2 = vali_df_pase1.union(herb_veg_vail)

    shuffle(train_df_phase2)
    shuffle(vali_df_pase2)

    train_df_phase3 = train_df_phase2.union(highway_train)
    vali_df_phase3 = vali_df_pase2.union(highway_vali)

    train_df_phase4 = train_df_phase3.union(industrial_train)
    vali_df_phase4 = vali_df_phase3.union(industrial_vali)

    shuffle(train_df_phase4)
    shuffle(vali_df_phase4)

    train_df_phase5 = train_df_phase4.union(pasture_train)
    vali_df_phase5 = vali_df_phase4.union(pasture_vali)

    shuffle(train_df_phase5)
    shuffle(vali_df_phase5)

    train_df_phase6 = train_df_phase5.union(perm_crop_train)
    vali_df_phase6 = vali_df_phase5.union(perm_crop_vali)

    train_df_phase7 = train_df_phase6.union(residential_train)
    vali_df_phase7 = vali_df_phase6.union(residential_vali)

    shuffle(train_df_phase7)
    shuffle(vali_df_phase7)

    train_df_phase8 = train_df_phase7.union(river_train)
    vali_df_phase8 = vali_df_phase7.union(river_vali)

    train_df = train_df_phase8.union(sea_lake_train)
    vali_df = vali_df_phase8.union(sea_lake_vali)

    train = shuffle(train_df)
    vali = shuffle(vali_df)

    for img in train[0]:
        element = preprocess_img(img)
        processed_images_train.append(element)

    for img in sorted(range(len(vali[0])), key=vali.__getitem__):
        element = preprocess_img(img)
        processed_images_vali.append(element)
        image_uri_df = sqlContext.createDataFrame(local_rows)
        return image_uri_df


    label_cardinality = 2

    label_list = ['Tap', 'Teapot']

    label_cardinality = len(label_list)
    label_nums = list(range(label_cardinality))




    banana_image_df = ImageSchema.readImages("hdfs://ec2-18-235-62-224.compute-1.amazonaws.com:9000/OID/Dataset/test/Banana").withColumn("label", lit(1))


    # banana_image_df = banana_image_df.withColumn("prefix", lit('Entity/data/food/fruit/'))

    accordion_image_df = ImageSchema.readImages("hdfs://ec2-18-235-62-224.compute-1.amazonaws.com:9000/OID/Dataset/test/Accordion").withColumn("label", lit(0))

    # accordion_image_df = accordion_image_df.withColumn("prefix", lit('Entity/data/food/fruit/'))


    banana_train, banana_test, _ = banana_image_df.randomSplit([0.99, 0.005, 0.005])  # use larger training sets (e.g. [0.6, 0.4] for non-community edition clusters)
    accordion_train, accordion_test, _ = accordion_image_df.randomSplit([0.99, 0.005, 0.005])     # use larger training sets (e.g. [0.6, 0.4] for non-community edition clusters)

    train_df = accordion_train.unionAll(banana_train)
    test_df = accordion_test.unionAll(banana_test)
Example #11
import logging

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit, col, udf
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName(
    "Algortimo de clasificacion multiclase").getOrCreate()

path = "./resources/"
angry_df = ImageSchema.readImages(path + "0/").withColumn("label", lit(0))
happy_df = ImageSchema.readImages(path + "3/").withColumn("label", lit(1))
sad_df = ImageSchema.readImages(path + "4/").withColumn("label", lit(2))

sc = spark.sparkContext

log4jLogger = sc._jvm.org.apache.log4j
log = log4jLogger.Logger.getLogger(__name__)

log.info("pyspark script logger initialized")

df1 = angry_df.union(happy_df).union(sad_df)

parse_ = udf(lambda a: Vectors.dense(a), VectorUDT())
df = df1.withColumn("features", parse_(df1["image.data"]))

train, test, _ = df.randomSplit([0.1, 0.05, 0.85])
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

__author__ = "Jillur Quddus"
__credits__ = ["Jillur Quddus"]
__version__ = "1.0.0"
__maintainer__ = "Jillur Quddus"
__email__ = "*****@*****.**"
__status__ = "Development"

# (2) Create a Spark Session using the Spark Context instantiated from spark-submit
spark = SparkSession.builder.appName("Convolutional Neural Networks - Transfer Learning - Image Recognition").getOrCreate()

# (3) Load the Plane and Bird images into Spark DataFrames and define a literal label column
path_to_img_directory = '/data/workspaces/jillur.quddus/jupyter/notebooks/Machine-Learning-with-Apache-Spark-QuickStart-Guide/chapter07/data/image-recognition-data'
birds_df = ImageSchema.readImages(path_to_img_directory + "/birds").withColumn("label", lit(0))
planes_df = ImageSchema.readImages(path_to_img_directory + "/planes").withColumn("label", lit(1))

# (4) Create Training and Test DataFrames respectively
planes_train_df, planes_test_df = planes_df.randomSplit([0.75, 0.25], seed=12345)
birds_train_df, birds_test_df = birds_df.randomSplit([0.75, 0.25], seed=12345)
train_df = planes_train_df.unionAll(birds_train_df)
test_df = planes_test_df.unionAll(birds_test_df)

# (5) Transform the Images into Numeric Feature Vectors using Transfer Learning and the pre-trained InceptionV3 Convolutional Neural Network
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")

# (6) Train a Logistic Regression Model to classify our images
logistic_regression = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")

# (7) Execute the Featurizer and Logistic Regression estimator within a Pipeline to generate the Trained Model
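The code for step (7) is not included in this excerpt; a sketch of how the stages defined above are typically combined, with the fitted-model and prediction variable names being assumptions:

from pyspark.ml import Pipeline

# Chain the InceptionV3 featurizer and the logistic regression estimator.
pipeline = Pipeline(stages=[featurizer, logistic_regression])
pipeline_model = pipeline.fit(train_df)                # train on the planes + birds training images
test_predictions = pipeline_model.transform(test_df)   # score the held-out test images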
from pyspark.sql import SparkSession

spark = SparkSession.builder \
      .appName("ImageClassification") \
      .config("spark.executor.memory", "70g") \
      .config("spark.driver.memory", "50g") \
      .config("spark.memory.offHeap.enabled",True) \
      .config("spark.memory.offHeap.size","16g") \
      .getOrCreate()

import pyspark.sql.functions as f
import sparkdl as dl
from pyspark.ml.image import ImageSchema

dfbuses = ImageSchema.readImages('data/buses/').withColumn('label', f.lit(0))
dfcars = ImageSchema.readImages('data/cars/').withColumn('label', f.lit(1))

dfbuses.show(5)
dfcars.show(5)

trainDFbuses, testDFbuses = dfbuses.randomSplit([0.60, 0.40], seed=123)
trainDFcars, testDFcars = dfcars.randomSplit([0.60, 0.40], seed=122)

trainDF = trainDFbuses.unionAll(trainDFcars)
testDF = testDFbuses.unionAll(testDFcars)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
vectorizer = dl.DeepImageFeaturizer(inputCol="image",
                                    outputCol="features",
Example #14
import glob
fs = glob.glob("flower_photos/sample/*.jpg")

import IPython.display as dp

# create list of image objects
images = []
for ea in fs:
    images.append(dp.Image(filename=ea, format='png'))

# display all images
for ea in images:
    dp.display_png(ea)

from pyspark.ml.image import ImageSchema
image_df = ImageSchema.readImages("flower_photos/sample/")
image_df.show()

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
from sparkdl.image import imageIO

from keras.applications import InceptionV3

model = InceptionV3(weights="imagenet")
model.save('model-full.h5')  # saves to the local filesystem

from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img
import numpy as np
from pyspark.sql.types import StringType, StructType, StructField, ArrayType, FloatType
    plt.imshow(two_d, interpolation='nearest')
    return plt


# get an image
img = gen_image(X_train[0])

# save image as png
img.savefig('/dbfs/mnt/' + account_name + '/' + container_name +
            '/sample_mnist_img.png',
            mode="overwrite")
plt.close()

# open png and display
from pyspark.ml.image import ImageSchema
image_df = ImageSchema.readImages('/mnt/' + account_name + '/' +
                                  container_name + '/sample_mnist_img.png')
display(image_df)

# start the run
run = exp.start_logging()

# train a model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# predict on test
y_hat = clf.predict(X_test)

# calculate accuracy on the prediction
acc = np.average(y_hat == y_test)
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
from functools import reduce
import seaborn as sns
import numpy as np
import itertools

# create spark session
spark = SparkSession.builder.appName('BD Recognizer').getOrCreate()

# loaded image
zero_df = ImageSchema.readImages("images/0").withColumn("label", lit(0))
one_df = ImageSchema.readImages("images/1").withColumn("label", lit(1))
two_df = ImageSchema.readImages("images/2").withColumn("label", lit(2))
three_df = ImageSchema.readImages("images/3").withColumn("label", lit(3))
four_df = ImageSchema.readImages("images/4").withColumn("label", lit(4))
five_df = ImageSchema.readImages("images/5").withColumn("label", lit(5))
six_df = ImageSchema.readImages("images/6").withColumn("label", lit(6))
seven_df = ImageSchema.readImages("images/7").withColumn("label", lit(7))
eight_df = ImageSchema.readImages("images/8").withColumn("label", lit(8))
nine_df = ImageSchema.readImages("images/9").withColumn("label", lit(9))

# merge data frame
dataframes = [
    zero_df, one_df, two_df, three_df, four_df, five_df, six_df, seven_df,
    eight_df, nine_df
]
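The merge itself is cut off here; a short sketch of how the listed DataFrames could be combined, reusing the `reduce` imported earlier in this snippet (the name `merged_df` is an assumption):

# Union the ten per-digit DataFrames into one labelled DataFrame.
merged_df = reduce(lambda a, b: a.union(b), dataframes)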
Example #17
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
import tensorflow as tf
import keras
from PIL import Image
import sparkdl
from pyspark.ml.image import ImageSchema
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

# Loading Train Images and creating Labels as 0 for Normal and 1 for Pneumonia images

Train_normal = ImageSchema.readImages("s3://image-class/train/NORMAL/")
Train_pneumonia = ImageSchema.readImages("s3://image-class/train/PNEUMONIA/")
Train_normal = Train_normal.withColumn("label", lit(0))
Train_pneumonia = Train_pneumonia.withColumn("label", lit(1))

# Combining all the Train Images into a single Train Dataset
Train_images = Train_pneumonia.union(Train_normal)

# Example of a Pneumonia Image loaded from the S3 Bucket
Train_pneumonia.first()

# Pictures of the Train Normal Chest X-Ray loaded from the S3 Bucket
display(Train_normal)

# Loading Test Images and Creating Labels as 0 for Normal Images and 1 for Pneumonia Images Test data
Test_normal = ImageSchema.readImages("s3://image-class/test/NORMAL/")
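The test-set loading is truncated at this point; a sketch of how it presumably continues, mirroring the training block above (the PNEUMONIA test path is an assumption):

Test_pneumonia = ImageSchema.readImages("s3://image-class/test/PNEUMONIA/")  # assumed path, mirroring the train layout
Test_normal = Test_normal.withColumn("label", lit(0))
Test_pneumonia = Test_pneumonia.withColumn("label", lit(1))

# Combining all the Test Images into a single Test Dataset
Test_images = Test_pneumonia.union(Test_normal)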
###### Loading the images from the input directory to MongoDB ######

from pyspark.ml.image import ImageSchema
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.window import Window

sparkTrain = SparkSession \
    .builder \
    .appName("ElecNET") \
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/ElecNet.ImgColl") \
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/ElecNet.ImgColl") \
    .getOrCreate()

#Reading images from the input directory
df = ImageSchema.readImages('data', recursive=True, dropImageFailures=True)
paths = df.select(df['image']['origin'].alias('filename'),
                  df['image']['data'].alias('image_bytes'),
                  df['image']['width'].alias('width'),
                  df['image']['height'].alias('height'))
split_col_filename = functions.split(paths['filename'], ':')
split_col_label = functions.split(paths['filename'], '-')
paths = paths.withColumn('category', split_col_label.getItem(1))
paths = paths.withColumn('filepath', split_col_filename.getItem(1))
paths = paths.select(paths['filepath'], paths['category'],
                     paths['image_bytes'], paths['width'], paths['height'])

#Creating train and test sets
splits = paths.randomSplit(weights=[0.8, 0.2])
train = splits[0]
test = splits[1]
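The write to MongoDB that the snippet's header announces is not shown; a sketch under the assumption that a pre-10.x MongoDB Spark Connector is on the classpath and that the `paths` DataFrame is what gets persisted:

# Persist the parsed image rows to the ElecNet.ImgColl collection configured in spark.mongodb.output.uri.
paths.write \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .mode("append") \
    .save()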
# coding: utf-8

# ## Reading the images

# In[1]:

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit

IMG_DIR = "chest_xray/"

train_df_normal = ImageSchema.readImages(IMG_DIR + "/train/NORMAL").withColumn(
    "label", lit(0))
train_df_pneumonia = ImageSchema.readImages(IMG_DIR +
                                            "/train/PNEUMONIA").withColumn(
                                                "label", lit(1))
train_df = train_df_normal.union(train_df_pneumonia)

test_df_normal = ImageSchema.readImages(IMG_DIR + "/test/NORMAL").withColumn(
    "label", lit(0))
test_df_pneumonia = ImageSchema.readImages(IMG_DIR +
                                           "/test/PNEUMONIA").withColumn(
                                               "label", lit(1))
test_df = test_df_normal.union(test_df_pneumonia)

# ## Model training

# In[ ]:

from sparkdl import DeepImageFeaturizer
from pyspark.ml import Pipeline
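The model-training code is cut off after these imports; a sketch of the usual sparkdl transfer-learning pipeline, consistent with the other examples on this page (the hyperparameters are assumptions):

from pyspark.ml.classification import LogisticRegression

# Featurize with pre-trained InceptionV3, then train a logistic regression on top.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
pipeline = Pipeline(stages=[featurizer, lr])
model = pipeline.fit(train_df)           # NORMAL/PNEUMONIA training DataFrame built above
predictions = model.transform(test_df)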
Example #20
                np.log(box[2] / anchors[best_anchor][0]),
                np.log(box[3] / anchors[best_anchor][1]), box_class
            ],
                                    dtype=np.float32)
            matching_true_boxes[i, j, best_anchor] = adjusted_box
    return detectors_mask, matching_true_boxes


# COMMAND ----------

# MAGIC %md
# MAGIC # ImageSchema

# COMMAND ----------

images_df = ImageSchema.readImages('/mnt/roy/object-detection/images/',
                                   numPartitions=16)

# COMMAND ----------

# MAGIC %md
# MAGIC # Prediction on `test.jpg`
# MAGIC Use it later to assert prediction using `UDF`

# COMMAND ----------

test_row = images_df.where(
    "image.origin='dbfs:/mnt/roy/object-detection/images/test.jpg'").take(
        1)[0][0]
array = ImageSchema.toNDArray(test_row)

# COMMAND ----------
def get_a_df(fpath):
    """将图片文件变成spark的DataFrame模型,该模型可以支持sql操作
    fpath:文件子路径和图片的label,将DATAPATH/fpath 下的图片读出来 并设置Label为fpath值"""
    dftemp = ImageSchema.readImages(DATAPATH+"/"+str(fpath)).withColumn("Label",lit(fpath))
    df_train, df_test = dftemp.randomSplit([.8, .2])
    return df_train, df_test
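A brief usage sketch, assuming the sub-folders under DATAPATH are named after their numeric labels (the label value 0 here is only an example):

# Hypothetical call: read the images under DATAPATH/0, label them 0, and split 80/20.
df_train_0, df_test_0 = get_a_df(0)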
Example #22
sc = SparkContext(conf=conf)

# Add in the sparkdl Dependencies
sys.path.insert(
    0,
    "/home/cdsw/spark-deep-learning/target/scala-2.11/spark-deep-learning-assembly-1.5.1-SNAPSHOT-spark2.4.jar"
)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

from sparkdl import DeepImageFeaturizer

from pyspark.ml.image import ImageSchema
label1_df = ImageSchema.readImages("data/personalities/jobs").withColumn(
    "label", lit(0))
label2_df = ImageSchema.readImages("data/personalities/zuckerberg").withColumn(
    "label", lit(1))
train1_df, test1_df = label1_df.randomSplit([0.6, 0.4])
train2_df, test2_df = label2_df.randomSplit([0.6, 0.4])
train1_df.show()
test1_df.show()

train_images_df = train1_df.unionAll(train2_df)
test_images_df = test1_df.unionAll(test2_df)

# Training Set
train_images_df.show()

# Test Set
test_images_df.show()
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
img_dir = "/home/ourui/deeplearning/images_classification-master/personalities"

#Read images and Create training & test DataFrames for transfer learning
jobs_df = ImageSchema.readImages(img_dir + "/jobs").withColumn("label", lit(1))
zuckerberg_df = ImageSchema.readImages(img_dir + "/zuckerberg").withColumn(
    "label", lit(0))
jobs_train, jobs_test = jobs_df.randomSplit([0.6, 0.4])
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit([0.6, 0.4])
#dataframe for training a classification model
train_df = jobs_train.unionAll(zuckerberg_train)

#dataframe for testing the classification model
test_df = jobs_test.unionAll(zuckerberg_test)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=50,
                        regParam=0.005,
                        elasticNetParam=0.1,
                        labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)
print("training model")
predictions = p_model.transform(test_df).select("image", "probability",
Example #24
# rdd = op.sc.parallelize([Row(predicted_labels=['daisy', '0.8918145298957825']),
#                         Row(predicted_labels=['picket_fence', '0.14247830212116241']),
#                         Row(predicted_labels=['daisy', '0.9532104134559631'])])

# df_row = spark.createDataFrame(rdd)


def assert_spark_df(df):
    assert isinstance(df, pyspark.sql.dataframe.DataFrame), "Not a Spark DF"


def assert_spark_model(model):
    assert isinstance(model, pyspark.ml.PipelineModel), "Not a model"


tulips_df = ImageSchema.readImages("tests/testtulips/").withColumn(
    "label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(
    "tests/testdaisy/",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(0))

train_df = tulips_df.unionAll(daisy_df)


def test_image_classifier_lr():
    model, df_preds = op.dl.image_classifier_lr(train_df)

    assert_spark_model(model)
    assert_spark_df(df_preds)


def test_evaluate_img_lr():
Example #25
# DBTITLE 1,Extract Training Images
# Extract Images
extractImagesSave(srcVideoPath, targetImgPath)

# Remove Empty Files
removeEmptyFiles(targetImgPath)

# View file list of images extracted from video
display(dbutils.fs.ls(targetImgPath))

# COMMAND ----------

# DBTITLE 1,Review Training Images
from pyspark.ml.image import ImageSchema

trainImages = ImageSchema.readImages(targetImgPath)
display(trainImages)

# COMMAND ----------

# DBTITLE 1,Feature Extraction using DeepImageFeaturizer
# MAGIC %md
# MAGIC Use [Spark Deep Learning Pipelines](https://github.com/databricks/spark-deep-learning) `DeepImageFeaturizer` to build image features via the InceptionV3 model

# COMMAND ----------


# DBTITLE 0,Save Features Function
# Save Image Features using
def saveImageFeatures(images, filePath):
    from sparkdl import DeepImageFeaturizer
            img_rescaled = resizeimage.resize_cover(new_im, [width, width])
            img_rescaled.save("{}/rescaled/{}".format(root, img))


if __name__ == "__main__":
    sc = SparkContext()
    img_dic = joblib.load("dictionary.pkl")[0]
    featurizer = DeepImageFeaturizer(inputCol="image",
                                     outputCol="features",
                                     modelName="InceptionV3")
    lr = LogisticRegressionModel.load('./lrModel')
    p_model = PipelineModel(stages=[featurizer, lr])

    directory = "./media"
    rescaled_dir = "{}/rescaled".format(directory)

    rescale_image(directory, rescaled_dir)

    temp_df = ImageSchema.readImages(rescaled_dir)
    df = p_model.transform(temp_df)
    f = open("predict_output.txt", "r+")
    f.seek(0)
    f.truncate()
    for i in df.select(['image', 'prediction']).collect():
        print("{} = {}".format(i[0][0].split('/')[-1], img_dic[int(i[1])]))
        f.write("{} = {}\n".format(i[0][0].split('/')[-1], img_dic[int(i[1])]))
    f.close()

    shutil.rmtree(rescaled_dir)

    # spark-submit --packages databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11 predict.py
# spark = SparkSession.builder.appName('SparkDeepLearning').getOrCreate()

# change configuration settings on Spark
# conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), (
#    'spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory', '8g')])

imageDir = "T://courses//BigData//data//flower_photos"

# Load images
# image_df = ImageSchema.readImages(imageDir, recursive = True).withColumn("label", lit(1))
# image_df.printSchema()
# image_df.show(5)
# train_df, test_df, _=image_df.randomSplit([0.1, 0.05, 0.85])

# read images using two methods
tulips_df = ImageSchema.readImages(imageDir + "/tulips").withColumn(
    "label", lit(1))
daisy_df = imageIO.readImagesWithCustomFn(
    imageDir + "/daisy",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(0))

# use larger training sets (e.g. [0.6, 0.4] for getting more images)
tulips_train, tulips_test, _ = tulips_df.randomSplit([0.1, 0.05, 0.85])
# use larger training sets (e.g. [0.6, 0.4] for getting more images)
daisy_train, daisy_test, _ = daisy_df.randomSplit([0.1, 0.05, 0.85])

train_df = tulips_train.unionAll(daisy_train)
test_df = tulips_test.unionAll(daisy_test)

# Under the hood, each of the partitions is fully loaded in memory, which may be expensive.
# This ensures that each of the partitions has a small size.
train_df = train_df.repartition(100)
Example #28
# COMMAND ----------

# load image

# COMMAND ----------

display(dbutils.fs.ls("dbfs:/FileStore/tables"))

# COMMAND ----------

from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
# Read images using Spark ... as a DataFrame.
# Each image is stored as a row in the imageSchema format.
image_cle = ImageSchema.readImages("dbfs:/FileStore/tables/cle/").withColumn(
    "label", lit(0))
image_cre = ImageSchema.readImages("dbfs:/FileStore/tables/cre/").withColumn(
    "label", lit(1))
image_ole = ImageSchema.readImages("dbfs:/FileStore/tables/ole/").withColumn(
    "label", lit(2))
image_ore = ImageSchema.readImages("dbfs:/FileStore/tables/ore/").withColumn(
    "label", lit(3))

# COMMAND ----------

image_cle.show(), image_cre.show(), image_ole.show(), image_ore.show()

# COMMAND ----------

type(image_cle)
Example #29
# Collect all the file and directory names into a list
directory_list = []

for file in dirs:
    directory_list.append(file)

directory_list.pop(0)

base_image_dir = '/images/'

full_image_df = 0

for dir in directory_list:
  full_image_dir = base_image_dir + "/" + dir
  if full_image_df == 0:
    df = ImageSchema.readImages(full_image_dir)
    df = df.withColumn("image_label", lit(dir.lower()))
    full_image_df = df
  else:
    df = ImageSchema.readImages(full_image_dir)
    df = df.withColumn("image_label", lit(dir.lower()))
    full_image_df = full_image_df.union(df)

full_image_df.write.format("parquet").mode("overwrite").save("/images/full_image_df/")

# COMMAND ----------

display(full_image_df)

# COMMAND ----------
Example #30
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

if __name__ == '__main__':

    sc = SparkContext(appName="Recognition App")

    output_directory = sc.broadcast(sys.argv[1])
    image_directory = sc.broadcast(sys.argv[2])

    # hdfs://pierre:41234/cs455/combined_images
    image_rdd = ImageSchema.readImages(image_directory.value).rdd
    image_rdd = image_rdd.repartition(20)
    image_rdd

    # this function will create a 128 feature embedding for each face, now we need to train a neural network on top of this
    def extract_embeddings(partition):
        face_cascade = cv2.CascadeClassifier(
            output_directory.value + '/haarcascade_frontalface_alt.xml')
        embedder = cv2.dnn.readNetFromTorch(output_directory.value +
                                            "/openface.nn4.small2.v1.t7")

        for old_image in partition:
            filename = old_image.image.origin.split('/')[-1]
            image = np.array(old_image.image.data)
            image = image.reshape((480, 854, 3))
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)