Example #1
    def read_images(self):

        class_paths = {}
        class_paths_test = {}
        class_img_nums = {}

        dirlist = [
            item for item in os.listdir(self.img_dir)
            if os.path.isdir(os.path.join(self.img_dir, item))
        ]

        if os.path.isdir(self.test_dir):
            shutil.rmtree(self.test_dir)

        for directory in dirlist:

            class_paths[directory] = os.path.join(self.img_dir, directory)
            num = self.images_num_sparkdl + self.images_num_sparkdl_run_best
            files = os.listdir(
                class_paths[directory])[self.images_num_sparkdl:num]

            for eachfilename in files:
                src = os.path.join(class_paths[directory], eachfilename)
                dst = os.path.join(self.test_dir, directory)
                if not os.path.exists(dst):
                    os.makedirs(dst)
                shutil.copy(src, os.path.join(dst, eachfilename))
            class_paths_test[directory] = dst

        hdfs_path_run = os.path.join(self.hdfs_path, "test")
        exists = os.system("hadoop fs -test -d %s" % (hdfs_path_run))
        if exists == 0:
            exists = os.system("hadoop fs -rm -r -skipTrash %s" %
                               (hdfs_path_run))
        os.system("hadoop fs -copyFromLocal %s %s" %
                  (self.test_dir, self.hdfs_path))
        for directory in dirlist:
            class_paths_test[directory] = os.path.join(self.hdfs_path, "test",
                                                       directory)

        test_df = readImages(class_paths_test[dirlist[0]]).withColumn(
            "label", lit(dirlist[0]))
        for class_label in range(1, len(dirlist)):
            classi_df = readImages(
                class_paths_test[dirlist[class_label]]).withColumn(
                    "label", lit(dirlist[class_label]))
            test_df = test_df.unionAll(classi_df)

        return test_df
Example #2
import sparkdl as dl
from sparkdl import readImages, DeepImagePredictor
from pyspark.sql import Row
from pyspark.ml.classification import DecisionTreeClassificationModel


def score_inceptionV3(images_filepath):
    inceptionV3 = DeepImagePredictor(inputCol="image",
                                     outputCol="predicted_labels",
                                     modelName="InceptionV3",
                                     decodePredictions=True,
                                     topK=5)
    image_df = readImages(images_filepath)
    predictions = inceptionV3.transform(image_df)
    return predictions
def handler(message):
    records = message.collect()
    for record in records:
        print('record', record, type(record))
        print('-----------')
        print('tuple', record[0], record[1], type(record[0]), type(record[1]))
        # producer.send(output_topic, b'message received')
        key = record[0]
        value = record[1]
        if len(key) > 10:
            image_path = value
            image_DF = dl.readImages(image_path)
            image_DF.show()
            tested_lr_test = p_lr_test.transform(image_DF)
            # tested_lr_test.show()
            predict_value = tested_lr_test.select('prediction').head()[0] - 1
            print('predict', predict_value)
            print('byte predict', str(predict_value).encode('utf-8'))
            print('byte key', str(key).encode('utf-8'))
            producer.send(output_topic,
                          key=str(key).encode('utf-8'),
                          value=str(predict_value).encode('utf-8'))
            producer.flush()
            print('predict over')
        elif len(key) == 10:
            print('entered csv model part')
            modelloaded = DecisionTreeClassificationModel.load(
                "hdfs:///treemodelofcsv")
            NewInput = Row('Type', 'Age', 'Breed1', 'Breed2', 'Gender',
                           'Color1', 'Color2', 'Color3', 'MaturitySize',
                           'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized',
                           'Health', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt')
            value_lst = str(value).split(',')
            print('value_lst', value_lst)
            print('lst_len', len(value_lst))
            new_input = NewInput(int(value_lst[0]), int(value_lst[1]),
                                 int(value_lst[2]), int(value_lst[3]),
                                 int(value_lst[4]), int(value_lst[5]),
                                 int(value_lst[6]), int(value_lst[7]),
                                 int(value_lst[8]), int(value_lst[9]),
                                 int(value_lst[10]), int(value_lst[11]),
                                 int(value_lst[12]), int(value_lst[13]),
                                 int(value_lst[14]), int(value_lst[15]),
                                 int(value_lst[16]), value_lst[17])
            df_new_input = sql_sc.createDataFrame([new_input])
            df_new_input.show()
            df_new_input = pipeline.fit(df_new_input).transform(df_new_input)
            df_new_input = feature.transform(df_new_input)
            new_predict = modelloaded.transform(df_new_input)
            new_predict.show()
            predict_value = str(new_predict.select('prediction').head()[0])
            print('predict value', predict_value.encode('utf-8'))
            producer.send(output_topic,
                          key=str(key).encode('utf-8'),
                          value=predict_value.encode('utf-8'))
            producer.flush()
from sparkdl import readImages
img_dir = '/databricks-datasets/definitive-guide/data/deep-learning-images/'
image_df = readImages(img_dir)

# COMMAND ----------

image_df.printSchema()

# COMMAND ----------

from sparkdl import readImages
from pyspark.sql.functions import lit
tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1))
daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0))
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])
train_df = tulips_train.unionAll(daisy_train)
test_df = tulips_test.unionAll(daisy_test)

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=1,
                        regParam=0.05,
                        elasticNetParam=0.3,
                        labelCol="label")
Example #5
import os

from pyspark.sql import Row
from pyspark.ml.classification import OneVsRestModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sparkdl import readImages, DeepImageFeaturizer

#sc = SparkContext()
#spark = SparkSession(sc)

imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/"
#imageDir = "hdfs://192.168.65.188:8020/paih/"


def getFileName(filePath):
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName


# Prepare Test Data
tmpTestDf = readImages(imageDir + "test5")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "test5.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()
finalTestDataFrame = tmptestX.join(csvTest, tmptestX.fileName == csvTest.image,
                                   'inner').drop(csvTest.image)

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)
model_dr = OneVsRestModel.load(imageDir + 'model-dicision-tree-regression')
import os, sys, re
import glob
sys.path.extend(
    glob.glob(os.path.join(os.path.expanduser("~"), ".ivy2/jars/*.jar")))
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf, col, lit, monotonically_increasing_id
from pyspark.ml.classification import GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from sparkdl import readImages

spark = SparkSession.builder.appName('alligator_training').getOrCreate()

img_alligator_df = readImages("/tmp/modeling/training/alligator").withColumn(
    "label", lit(1)).where(col("image").isNotNull())
img_other_df = readImages("/tmp/modeling/training/not_alligator").withColumn(
    "label", lit(0)).where(col("image").isNotNull())

#img_other_df.withColumn('uid',monotonically_increasing_id()).filter('uid < 10').count()
#img_other_df.show()

# Testing and Train Split (using 30% train / 70% test because a higher training pct ran out of memory)
training_pct = 0.30
testing_pct = 0.70

alligator_train, alligator_test = img_alligator_df.randomSplit(
    [training_pct, testing_pct])
other_train, other_test = img_other_df.randomSplit([training_pct, testing_pct])

train_df = alligator_train.unionAll(other_train)
Example #7
import sys,glob,os
sys.path.extend(glob.glob(os.path.join(os.path.expanduser("~"),".ivy2/jars/*.jar")))
 
### PySpark code to read images, create a Spark ML pipeline, train the model & predict

from sparkdl import readImages
from pyspark.sql.functions import lit
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
 
img_dir = "/user/arun/TrainingData"
 
cat1_df = readImages(img_dir + "/cat1").withColumn("label", lit(1))
cat2_df = readImages(img_dir + "/cat2").withColumn("label", lit(2))
cat3_df = readImages(img_dir + "/cat3").withColumn("label", lit(3))
cat4_df = readImages(img_dir + "/cat4").withColumn("label", lit(4))
cat5_df = readImages(img_dir + "/cat5").withColumn("label", lit(5))
 
# Split the images: 90% of them go to training data, 10% go to test data
 
cat1_train, cat1_test = cat1_df.randomSplit([0.9, 0.1])
cat2_train, cat2_test = cat2_df.randomSplit([0.9, 0.1])
cat3_train, cat3_test = cat3_df.randomSplit([0.9, 0.1])
cat4_train, cat4_test = cat4_df.randomSplit([0.9, 0.1])
cat5_train, cat5_test = cat5_df.randomSplit([0.9, 0.1])
 
train_df = cat1_train.unionAll(cat2_train).unionAll(cat3_train).unionAll(cat4_train).unionAll(cat5_train)
test_df = cat1_test.unionAll(cat2_test).unionAll(cat3_test).unionAll(cat4_test).unionAll(cat5_test)
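
# A minimal sketch of the "create spark ml pipeline, train the model & predict"
# steps announced above, using the RandomForestClassifier imported earlier;
# the hyperparameters below are illustrative, not from the original.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                            numTrees=20)
p = Pipeline(stages=[featurizer, rf])
p_model = p.fit(train_df)

predictions = p_model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test accuracy = " + str(evaluator.evaluate(
    predictions.select("prediction", "label"))))
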
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
import sparkdl as dl
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline, PipelineModel
conf = SparkConf().setAppName("image_testset").setMaster("yarn")
sc = SparkContext(conf=conf)
sql_sc = SQLContext(sc)

lr_test = LogisticRegressionModel.load('hdfs:///lr')
featurizer_test = dl.DeepImageFeaturizer(inputCol="image",
                                         outputCol="features",
                                         modelName="InceptionV3")
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])
image_path = "hdfs:///project_data/pets/test_images/"
image_DF = dl.readImages(image_path)
image_DF.show(10)
tested_lr_test = p_lr_test.transform(image_DF)
tested_lr_test.sample(False, 0.1).show()
Example #9
import os

from sparkdl import readImages
from pyspark.sql import Row

imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work"

def getFileName(filePath):
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName

# Prepare Train Data
tmpTrainDf = readImages(imageDir + "/train25")
tmpTrainRDD = tmpTrainDf.rdd.map(lambda x : Row(filepath = x[0], image = x[1], fileName = getFileName(x[0])))
tmpTrainX = tmpTrainRDD.toDF()
csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/train25.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x : Row(image = x[0], label = int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain, tmpTrainX.fileName == csvTrain.image, 'inner').drop(csvTrain.image)

# Prepare Test Data
tmpTestDf = readImages(imageDir + "/test5")
tmpTestRDD = tmpTestDf.rdd.map(lambda x : Row(filepath = x[0], image = x[1], fileName = getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test5.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x : Row(image = x[0], label = int(x[1])))
csvTest = csvTestRDD.toDF()
finalTestDataFrame = tmptestX.join(csvTest, tmptestX.fileName == csvTest.image, 'inner').drop(csvTest.image)

from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
    def prepare_data(self):

        images_num = self.data_params["images_num"]
        train_ratio = self.data_params["train_ratio"]
        test_ratio = self.data_params["test_ratio"]
        img_dir = self.data_params["img_dir"]
        run_dir = self.data_params["run_dir"]
        hdfs_path = self.data_params["hdfs_path"]

        class_paths = {}
        class_paths_run = {}
        class_img_nums = {}

        dirlist = [
            item for item in os.listdir(img_dir)
            if os.path.isdir(os.path.join(img_dir, item))
        ]

        if os.path.isdir(run_dir):
            shutil.rmtree(run_dir)

        for directory in dirlist:
            class_paths[directory] = os.path.join(img_dir, directory)
            class_img_nums[directory] = len([
                name for name in os.listdir(class_paths[directory])
                if os.path.isfile(os.path.join(class_paths[directory], name))
            ])

            if images_num > class_img_nums[directory] or images_num == 0:
                files = os.listdir(class_paths[directory])
            else:
                files = os.listdir(class_paths[directory])[:images_num]

            for eachfilename in files:
                src = os.path.join(class_paths[directory], eachfilename)
                dst = os.path.join(run_dir, directory)
                if not os.path.exists(dst):
                    os.makedirs(dst)
                shutil.copy(src, os.path.join(dst, eachfilename))

        hdfs_path_run = os.path.join(hdfs_path, "run")
        exists = os.system("hadoop fs -test -d %s" % (hdfs_path_run))
        if exists == 0:
            exists = os.system("hadoop fs -rm -r -skipTrash %s" %
                               (hdfs_path_run))
        os.system("hadoop fs -copyFromLocal %s %s" % (run_dir, hdfs_path))

        for directory in dirlist:
            class_paths_run[directory] = os.path.join(hdfs_path, "run",
                                                      directory)

        class0_df = readImages(class_paths_run[dirlist[0]]).withColumn(
            "label", lit(0))
        train_df, test_df = class0_df.randomSplit([train_ratio, test_ratio],
                                                  seed=1234)
        for class_label in range(1, len(dirlist)):
            classi_df = readImages(
                class_paths_run[dirlist[class_label]]).withColumn(
                    "label", lit(class_label))
            classi_train, classi_test = classi_df.randomSplit(
                [train_ratio, test_ratio], seed=1234)
            train_df = train_df.unionAll(classi_train)
            test_df = test_df.unionAll(classi_test)

        return train_df, test_df
Example #11
def read_imageDir(img_dir):
    original_df = readImages(img_dir + "/jobs").withColumn("label", lit(1))
    otsu_df = readImages(img_dir + "/zuckerberg").withColumn("label", lit(0))
    return original_df, otsu_df
Example #12
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from sparkdl import readImages

csv = spark.read.format("csv").option("header", "true").load("PATH/TO/CSV") \
    .select('level').limit(100) \
    .withColumn("rowNumber", row_number().over(Window.partitionBy("level").orderBy("level")))
retinopathyDf = readImages("PATH/TO/IMAGES").limit(100) \
    .withColumn("rowNumber", row_number().over(Window.partitionBy("filePath").orderBy("image")))
finalDataFrame = retinopathyDf.join(csv, retinopathyDf.rowNumber == csv.rowNumber, 'inner').drop(csv.rowNumber)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="level")
p = Pipeline(stages=[featurizer, lr])

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.functions import col, udf, substring
from pyspark.sql.types import DoubleType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

import sparkdl as dl
from sparkdl import DeepImageFeaturizer

#sc = init_nncontext("sparkdl")
conf = SparkConf().setAppName("sparkdl").setMaster("yarn")
sc = SparkContext(conf=conf)

image_path = "hdfs:///project_data/pets/train_images/"
csv_path = "hdfs:///project_data/pets/train/train.csv"
sql_sc = SQLContext(sc)
csv_df = sql_sc.read.format("csv").option("header","true").load(csv_path)
csv_df.printSchema()
image_DF = dl.readImages(image_path).withColumn("id",substring("filePath",50,9))
image_DF.printSchema()
image_DF.show(10)
labelDF = image_DF.join(csv_df, image_DF.id == csv_df.PetID, "left").withColumn("label",col("AdoptionSpeed").cast("double")+1).select("image","label")
#labelDF.count()
labelDF = labelDF.na.drop().limit(2000)
#labelDF.count()

(trainingDF, validationDF) = labelDF.randomSplit([0.7, 0.3])
trainingDF.show(10)
print("show over")
vectorizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3')
logreg = LogisticRegression(maxIter=30,regParam=0.05, elasticNetParam=0.3, labelCol = "label", featuresCol="features")
pipeline = Pipeline(stages=[vectorizer, logreg])

pipeline_model = pipeline.fit(trainingDF)
Example #14
# Notebook to make model, originally ran on Databricks
# Import libraries

from sparkdl import readImages
from pyspark.sql.functions import lit

# make the images dataframes

path = "/CS4301/normalize/"

food_df = readImages(path + "food").withColumn("label", lit(0))
inside_df = readImages(path + "inside").withColumn("label", lit(1))
outside_df = readImages(path + "outside").withColumn("label", lit(2))
menu_df = readImages(path + "menu").withColumn("label", lit(3))
drink_df = readImages(path + "drink").withColumn("label", lit(4))

# Make the training and testing dataset

f_train, f_test = food_df.randomSplit([0.8, 0.2])
i_train, i_test = inside_df.randomSplit([0.8, 0.2])
o_train, o_test = outside_df.randomSplit([0.8, 0.2])
m_train, m_test = menu_df.randomSplit([0.8, 0.2])
d_train, d_test = drink_df.randomSplit([0.8, 0.2])

# Union the datasets
# Function used to union several dataframes

from functools import reduce

from pyspark.sql import DataFrame


def unionAll(*dfs):
    # fold DataFrame.unionAll over all of the passed DataFrames
    return reduce(DataFrame.unionAll, dfs)
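
# A likely use of the helper above (assumed; the original snippet is cut off
# here): union the per-class splits into single training and test DataFrames.
train_df = unionAll(f_train, i_train, o_train, m_train, d_train)
test_df = unionAll(f_test, i_test, o_test, m_test, d_test)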
Example #15
# ./bin/pyspark --packages databricks:spark-deep-learning:0.1.0-spark2.1-s_2.11 --driver-memory 5g
from sparkdl import readImages
from pyspark.sql.functions import lit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

img_dir = "/home/gayatri/Documents/Spark/DeepLearning/personalities/"

#Read images and Create training & test DataFrames for transfer learning
person1_df = readImages(img_dir + "/person1").withColumn("label", lit(1))
person2_df = readImages(img_dir + "/person2").withColumn("label", lit(0))
person1_train, person1_test = person1_df.randomSplit([0.6, 0.4])
person2_train, person2_test = person2_df.randomSplit([0.6, 0.4])

#dataframe for training a classification model
train_df = person1_train.unionAll(person2_train)

#dataframe for testing the classification model
test_df = person1_test.unionAll(person2_test)


featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)

predictions = p_model.transform(test_df)

predictions.select("filePath", "prediction").show(truncate=False)
Example #16
# image import
from pyspark.sql import SparkSession
from pyspark import SparkContext

sc = SparkContext()
spark = SparkSession(sc)

from sparkdl import readImages
from pyspark.sql.functions import lit

img_dir = "hdfs:///flower-classify/flowers"

#Read images and Create training & test DataFrames for transfer learning
daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0))
dandelion_df = readImages(img_dir + "/dandelion").withColumn("label", lit(1))
roses_df = readImages(img_dir + "/roses").withColumn("label", lit(2))
sunflowers_df = readImages(img_dir + "/sunflowers").withColumn("label", lit(3))
tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(4))

#split each class into training and test sets (0.6/0.4 ratio assumed)
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])
dandelion_train, dandelion_test = dandelion_df.randomSplit([0.6, 0.4])
roses_train, roses_test = roses_df.randomSplit([0.6, 0.4])
sunflowers_train, sunflowers_test = sunflowers_df.randomSplit([0.6, 0.4])
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])

#dataframe for training a classification model
train_df = daisy_train.unionAll(dandelion_train).unionAll(
    roses_train).unionAll(sunflowers_train).unionAll(tulips_train)
print("TRAIN DF PREPARED.")

#dataframe for testing the classification model
test_df = daisy_test.unionAll(dandelion_test).unionAll(roses_test).unionAll(
    sunflowers_test).unionAll(tulips_test)

# model creation

from pyspark.ml.classification import LogisticRegression
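
# A minimal sketch of the model-creation step announced above, following the
# usual sparkdl transfer-learning pattern; the hyperparameters are
# illustrative, not taken from the original.
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3,
                        labelCol="label")
p_model = Pipeline(stages=[featurizer, lr]).fit(train_df)
predictions = p_model.transform(test_df)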
Example #17
import tensorflow as tf
import pyspark.sql.functions as f
import sparkdl as dl
from pyspark.sql.functions import lit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

#root directory
img_dir = "filtered_dataset/"

magnifications = ['40', '100', '200', '400']

for m in magnifications:
    b_df = dl.readImages(img_dir + "/b" + m).withColumn("label", lit(1))
    m_df = dl.readImages(img_dir + "/m" + m).withColumn("label", lit(0))

    #Splitting the data into training and test in the ratio 80% & 20%
    trainb, testb = b_df.randomSplit([80.00, 20.00], seed=42)
    trainm, testm = m_df.randomSplit([80.00, 20.00], seed=42)

    #combining the dataset benign and malignanent for the training and testing
    trainDF = trainb.unionAll(trainm)
    testDF = testb.unionAll(testm)

    vectorizer = dl.DeepImageFeaturizer(inputCol="image",
                                        outputCol="features",
                                        modelName='InceptionV3')
    logreg = LogisticRegression(maxIter=10,
                                regParam=0.01,
                                elasticNetParam=0.1,
                                labelCol="label",
                                featuresCol="features")
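
    # A hedged continuation (not in the original snippet): assemble the
    # featurizer and classifier into a pipeline and fit/score it for this
    # magnification.
    pipeline = Pipeline(stages=[vectorizer, logreg])
    model = pipeline.fit(trainDF)
    predictions = model.transform(testDF)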
Example #18
from pyspark.ml.classification import OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from sparkdl import readImages
from pyspark.sql import Row
from sparkdl import DeepImageFeaturizer
from pyspark.ml.classification import OneVsRestModel
from pyspark.sql.functions import lit

# Prepare Test Data

imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/dst/resized/"

labelZeroDf = readImages(imageDir + "tl0").withColumn("label", lit(0))
labelOneDf = readImages(imageDir + "tl1").withColumn("label", lit(1))
labelTwoDf = readImages(imageDir + "tl2").withColumn("label", lit(2))
labelThreeDf = readImages(imageDir + "tl3").withColumn("label", lit(3))
labelFourDf = readImages(imageDir + "tl4").withColumn("label", lit(4))
finalTestDf = labelZeroDf.unionAll(labelOneDf).unionAll(labelTwoDf).unionAll(
    labelThreeDf).unionAll(labelFourDf)

testSize = finalTestDf.count()
print(str(testSize))
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDf)
model_dc = OneVsRestModel.load(imageDir + 'model-decision-tree-classifier-new')

predictions = model_dc.transform(featureVector)
predictions.show()
Example #19
from sparkdl import readImages
from pyspark.sql.functions import lit

#define data directory
imgDir = "/home/mvk/image-classification/sushant"

#read Images for training data without diabetic-retinopathy as negDf
negDf = readImages(imgDir + "/training/negative").withColumn("label", lit(0))

#read Images for training data with diabetic-retinopathy as posDf
posDf = readImages(imgDir + "/training/positive").withColumn("label", lit(1))

#read Images for test data without diabetic-retinopathy as testNegDf
testNegDf = readImages(imgDir + "/test/negative").withColumn("label", lit(0))

#read Images for test data with diabetic-retinopathy as testPosDf
testPosDf = readImages(imgDir + "/test/positive").withColumn("label", lit(1))

#prepare complete training data by combining negDf and posDf
trainDf = negDf.unionAll(posDf)

#prepare complete test data by combining testNegDf and testPosDf
testDf = testNegDf.unionAll(testPosDf)

#various required imports
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

#use of DeepImageFeaturizer for extracting features for image classification
#for provided input images
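
# A hedged sketch of the featurizer/classifier pipeline these comments lead up
# to; the settings are illustrative, not from the original example.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3,
                        labelCol="label")
p_model = Pipeline(stages=[featurizer, lr]).fit(trainDf)
predictions = p_model.transform(testDf)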
Example #20
from sparkdl import readImages
from pyspark.sql.functions import lit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer


img_dir = "/home/mvk/images_classification-master/personalities"

jobs_df = readImages(img_dir + "/jobs").withColumn("label", lit(1))
zuckerberg_df = readImages(img_dir + "/zuckerberg").withColumn("label", lit(0))
jobs_train, jobs_test = jobs_df.randomSplit([0.6, 0.4])
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit([0.6, 0.4])

train_df = jobs_train.unionAll(zuckerberg_train)
test_df = jobs_test.unionAll(zuckerberg_test)

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)
predictions = p_model.transform(test_df)

predictions.select("filePath", "prediction").show(truncate=False)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
df = p_model.transform(test_df)
df.show()

predictionAndLabels = df.select("prediction", "label")
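
# A hedged sketch of evaluating these predictions with the imported
# MulticlassClassificationEvaluator (the metric choice is illustrative).
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))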
Example #21
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.ml.classification import LogisticRegression, OneVsRest
from sparkdl import readImages, DeepImageFeaturizer

sc = SparkContext()
spark = SparkSession(sc)

#imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work"
imageDir = "hdfs://192.168.65.188:8020/paih/"
#imageDir = "hdfs://10.0.0.7:8020/paih/"


def getFileName(filePath):
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName

# Prepare Train Data
tmpTrainDf = readImages(imageDir + "/train25")
#tmpTrainDf = readImages(imageDir + "/test1")
tmpTrainRDD = tmpTrainDf.rdd.map(lambda x : Row(filepath = x[0], image = x[1], fileName = getFileName(x[0])))
tmpTrainX = tmpTrainRDD.toDF()
csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/train25.csv")
#csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x : Row(image = x[0], label = int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain, tmpTrainX.fileName == csvTrain.image, 'inner').drop(csvTrain.image)


featurizer = DeepImageFeaturizer(inputCol="image",outputCol="features", modelName="InceptionV3")

method = LogisticRegression(maxIter=50, regParam=0.05, elasticNetParam=0.3, labelCol="label")
ovr = OneVsRest(classifier = method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
Example #22
import sys

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

spark = SparkSession.builder.appName('Weather Image Classifier - Data Analysis').getOrCreate()

assert sys.version_info >= (3, 4) # make sure we have Python 3.4+
assert spark.version >= '2.2' # make sure we have Spark 2.2+

from sparkdl import readImages

img_dir = "katkam-scaled"

#Read images and Create training & test DataFrames for transfer learning
jobs_df = readImages(img_dir)
jobs_df.show()
df = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3").transform(jobs_df)
df.show()
i = 2



df = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df).transform(df)
jobs_train, jobs_test = df.randomSplit([0.6, 0.4])

lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p_model = lr.fit(jobs_train)
predictions = p_model.transform(jobs_test)

predictions.select("filePath", "prediction").show(truncate=False)
# pyspark --packages databricks:spark-deep-learning:0.1.0-spark2.1-s_2.11
# https://github.com/databricks/spark-deep-learning/issues/18
# from sparkdl import readImages
import sparkdl
image_df = sparkdl.readImages(
    "/Users/502677522/Class/DataWeekend/2017_12_02/data/test/1.jpg")
image_df.show()

from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img
import numpy as np
import os
from sparkdl import KerasImageFileTransformer


def loadAndPreprocessKerasInceptionV3(uri):
    # this is a typical way to load and prep images in keras
    image = img_to_array(load_img(uri, target_size=(299, 299)))
    image = np.expand_dims(image, axis=0)
    return preprocess_input(image)


uri = "/Users/502677522/Class/DataWeekend/2017_12_02/data/test/1.jpg"
dat = loadAndPreprocessKerasInceptionV3(uri)

# >>> dat.shape
# (1, 299, 299, 3)
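
# A hedged sketch of applying the imported KerasImageFileTransformer with the
# loader above; the model file path is illustrative (a Keras model saved with
# model.save), not part of the original snippet.
from pyspark.sql.types import StringType

transformer = KerasImageFileTransformer(inputCol="uri", outputCol="predictions",
                                        modelFile="/tmp/model-full.h5",
                                        imageLoader=loadAndPreprocessKerasInceptionV3,
                                        outputMode="vector")
uri_df = spark.createDataFrame([uri], StringType()).toDF("uri")
keras_pred_df = transformer.transform(uri_df)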
Example #24
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from sparkdl import readImages, DeepImageFeaturizer

#sc = SparkContext()
#spark = SparkSession(sc)

imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/"
#imageDir = "hdfs://192.168.65.188:8020/paih/"


def getFileName(filePath):
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName


# Prepare Test Data
tmpTestDf = readImages(imageDir + "train25_2")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "train25_2.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()
finalTestDataFrame = tmptestX.join(csvTest, tmptestX.fileName == csvTest.image,
                                   'inner').drop(csvTest.image)

featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)
del tmpTestDf
Example #25
# Databricks notebook source
# MAGIC %md
# MAGIC ![Architecture](https://s3-us-west-1.amazonaws.com/databricks-binu-mathew/image/deep_learning/jets.png)

# COMMAND ----------

# MAGIC %md
# MAGIC ![Architecture](https://s3-us-west-1.amazonaws.com/databricks-binu-mathew/image/deep_learning/11.png)

# COMMAND ----------

from sparkdl import readImages
from pyspark.sql.functions import lit

img_dir = '/tmp/demo/planes'
passenger_jets_train_df = readImages(img_dir + "/passenger_jets").withColumn(
    "label", lit(1))
fighter_jets_train_df = readImages(img_dir + "/fighter_jets").withColumn(
    "label", lit(0))

#dataframe for training a classification model
train_df = passenger_jets_train_df.unionAll(fighter_jets_train_df)

#dataframe for testing the classification model
test_df = readImages('/bmathew/test_data')

# COMMAND ----------

import sys
sys.stdout.isatty = lambda: False
sys.stdout.encoding = 'utf-8'