def read_images(self):
    class_paths = {}
    class_paths_test = {}
    class_img_nums = {}
    dirlist = [
        item for item in os.listdir(self.img_dir)
        if os.path.isdir(os.path.join(self.img_dir, item))
    ]
    if os.path.isdir(self.test_dir):
        shutil.rmtree(self.test_dir)
    for directory in dirlist:
        class_paths[directory] = os.path.join(self.img_dir, directory)
        num = self.images_num_sparkdl + self.images_num_sparkdl_run_best
        files = os.listdir(class_paths[directory])[self.images_num_sparkdl:num]
        for eachfilename in files:
            src = os.path.join(class_paths[directory], eachfilename)
            dst = os.path.join(self.test_dir, directory)
            if not os.path.exists(dst):
                os.makedirs(dst)
            retrn_val = shutil.copy(src, os.path.join(dst, eachfilename))
        class_paths_test[directory] = os.path.join(dst)
    hdfs_path_run = os.path.join(self.hdfs_path, "test")
    exists = os.system("hadoop fs -test -d %s" % (hdfs_path_run))
    if exists == 0:
        exists = os.system("hadoop fs -rm -r -skipTrash %s" % (hdfs_path_run))
    os.system("hadoop fs -copyFromLocal %s %s" % (self.test_dir, self.hdfs_path))
    for directory in dirlist:
        class_paths_test[directory] = os.path.join(self.hdfs_path, "test",
                                                   directory)
    test_df = readImages(class_paths_test[dirlist[0]]).withColumn(
        "label", lit(dirlist[0]))
    for class_label in range(1, len(dirlist)):
        classi_df = readImages(
            class_paths_test[dirlist[class_label]]).withColumn(
                "label", lit(dirlist[class_label]))
        test_df = test_df.unionAll(classi_df)
    return test_df
def score_inceptionV3(images_filepath):
    inceptionV3 = DeepImagePredictor(inputCol="image",
                                     outputCol="predicted_labels",
                                     modelName="InceptionV3",
                                     decodePredictions=True,
                                     topK=5)
    image_df = readImages(images_filepath)
    predictions = inceptionV3.transform(image_df)
    return predictions
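# Usage sketch for the function above (the HDFS path is a placeholder, and
# readImages/DeepImagePredictor are assumed to be imported from sparkdl as in
# the surrounding snippets): score a directory of images and show the top-5
# decoded ImageNet labels produced by DeepImagePredictor.
predictions = score_inceptionV3("hdfs:///tmp/images_to_score")
predictions.select("filePath", "predicted_labels").show(truncate=False)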
def handler(message):
    records = message.collect()
    for record in records:
        print('record', record, type(record))
        print('-----------')
        print('tuple', record[0], record[1], type(record[0]), type(record[1]))
        # producer.send(output_topic, b'message received')
        key = record[0]
        value = record[1]
        if len(key) > 10:
            image_path = value
            image_DF = dl.readImages(image_path)
            image_DF.show()
            tested_lr_test = p_lr_test.transform(image_DF)
            # tested_lr_test.show()
            predict_value = tested_lr_test.select('prediction').head()[0] - 1
            print('predict', predict_value)
            print('byte predict', str(predict_value).encode('utf-8'))
            print('byte key', str(key).encode('utf-8'))
            producer.send(output_topic,
                          key=str(key).encode('utf-8'),
                          value=str(predict_value).encode('utf-8'))
            producer.flush()
            print('predict over')
        elif len(key) == 10:
            print('entered csv model part')
            modelloaded = DecisionTreeClassificationModel.load(
                "hdfs:///treemodelofcsv")
            NewInput = Row('Type', 'Age', 'Breed1', 'Breed2', 'Gender',
                           'Color1', 'Color2', 'Color3', 'MaturitySize',
                           'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized',
                           'Health', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt')
            value_lst = str(value).split(',')
            print('value_lst', value_lst)
            print('lst_len', len(value_lst))
            new_input = NewInput(int(value_lst[0]), int(value_lst[1]),
                                 int(value_lst[2]), int(value_lst[3]),
                                 int(value_lst[4]), int(value_lst[5]),
                                 int(value_lst[6]), int(value_lst[7]),
                                 int(value_lst[8]), int(value_lst[9]),
                                 int(value_lst[10]), int(value_lst[11]),
                                 int(value_lst[12]), int(value_lst[13]),
                                 int(value_lst[14]), int(value_lst[15]),
                                 int(value_lst[16]), value_lst[17])
            df_new_input = sql_sc.createDataFrame([new_input])
            df_new_input.show()
            df_new_input = pipeline.fit(df_new_input).transform(df_new_input)
            df_new_input = feature.transform(df_new_input)
            new_predict = modelloaded.transform(df_new_input)
            new_predict.show()
            predict_value = str(new_predict.select('prediction').head()[0])
            print('predict value', predict_value.encode('utf-8'))
            producer.send(output_topic,
                          key=str(key).encode('utf-8'),
                          value=predict_value.encode('utf-8'))
            producer.flush()
from sparkdl import readImages

img_dir = '/databricks-datasets/definitive-guide/data/deep-learning-images/'
image_df = readImages(img_dir)

# COMMAND ----------

image_df.printSchema()

# COMMAND ----------

from sparkdl import readImages
from pyspark.sql.functions import lit

tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(1))
daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0))
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])
train_df = tulips_train.unionAll(daisy_train)
test_df = tulips_test.unionAll(daisy_test)

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=1, regParam=0.05, elasticNetParam=0.3,
                        labelCol="label")
import os

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import OneVsRestModel
from pyspark.sql import Row
from sparkdl import readImages, DeepImageFeaturizer

#sc = SparkContext()
#spark = SparkSession(sc)
imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/"
#imageDir = "hdfs://192.168.65.188:8020/paih/"


def getFileName(filePath):
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName


# Prepare Test Data
tmpTestDf = readImages(imageDir + "test5")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "test5.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()
finalTestDataFrame = tmptestX.join(csvTest,
                                   tmptestX.fileName == csvTest.image,
                                   'inner').drop(csvTest.image)

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)
model_dr = OneVsRestModel.load(imageDir + 'model-dicision-tree-regression')
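# Follow-up sketch (not part of the original excerpt): apply the loaded
# OneVsRest model to the featurized test data and report accuracy with the
# evaluator imported above. Column names follow the defaults used elsewhere
# in these snippets ("label" and "prediction").
predictions = model_dr.transform(featureVector)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy:", evaluator.evaluate(predictions))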
import os, sys, re
import glob

sys.path.extend(
    glob.glob(os.path.join(os.path.expanduser("~"), ".ivy2/jars/*.jar")))

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf, col, lit, monotonically_increasing_id
from pyspark.ml.classification import GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from sparkdl import readImages

spark = SparkSession.builder.appName('alligator_training').getOrCreate()

img_alligator_df = readImages("/tmp/modeling/training/alligator").withColumn(
    "label", lit(1)).where(col("image").isNotNull())
img_other_df = readImages("/tmp/modeling/training/not_alligator").withColumn(
    "label", lit(0)).where(col("image").isNotNull())
#img_other_df.withColumn('uid',monotonically_increasing_id()).filter('uid < 10').count()
#img_other_df.show()

# Train/test split (using 30/70 because a higher training pct was running out of memory)
training_pct = 0.30
testing_pct = 0.70

alligator_train, alligator_test = img_alligator_df.randomSplit(
    [training_pct, testing_pct])
other_train, other_test = img_other_df.randomSplit([training_pct, testing_pct])

train_df = alligator_train.unionAll(other_train)
import sys, glob, os

sys.path.extend(
    glob.glob(os.path.join(os.path.expanduser("~"), ".ivy2/jars/*.jar")))

### PySpark code to read images, create a Spark ML pipeline, train the model & predict
from sparkdl import readImages
from pyspark.sql.functions import lit
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

img_dir = "/user/arun/TrainingData"

cat1_df = readImages(img_dir + "/cat1").withColumn("label", lit(1))
cat2_df = readImages(img_dir + "/cat2").withColumn("label", lit(2))
cat3_df = readImages(img_dir + "/cat3").withColumn("label", lit(3))
cat4_df = readImages(img_dir + "/cat4").withColumn("label", lit(4))
cat5_df = readImages(img_dir + "/cat5").withColumn("label", lit(5))

# Split the images so that 90% go to training data and 10% to test data
cat1_train, cat1_test = cat1_df.randomSplit([0.9, 0.1])
cat2_train, cat2_test = cat2_df.randomSplit([0.9, 0.1])
cat3_train, cat3_test = cat3_df.randomSplit([0.9, 0.1])
cat4_train, cat4_test = cat4_df.randomSplit([0.9, 0.1])
cat5_train, cat5_test = cat5_df.randomSplit([0.9, 0.1])

train_df = cat1_train.unionAll(cat2_train).unionAll(cat3_train).unionAll(
    cat4_train).unionAll(cat5_train)
test_df = cat1_test.unionAll(cat2_test).unionAll(cat3_test).unionAll(
    cat4_test).unionAll(cat5_test)
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
import sparkdl as dl
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline, PipelineModel

conf = SparkConf().setAppName("image_testset").setMaster("yarn")
sc = SparkContext(conf=conf)
sql_sc = SQLContext(sc)

lr_test = LogisticRegressionModel.load('hdfs:///lr')
featurizer_test = dl.DeepImageFeaturizer(inputCol="image",
                                         outputCol="features",
                                         modelName="InceptionV3")
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])

image_path = "hdfs:///project_data/pets/test_images/"
image_DF = dl.readImages(image_path)
image_DF.show(10)

tested_lr_test = p_lr_test.transform(image_DF)
tested_lr_test.sample(False, 0.1).show()
import os

from sparkdl import readImages
from pyspark.sql import Row

imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work"


def getFileName(filePath):
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName


# Prepare Train Data
tmpTrainDf = readImages(imageDir + "/train25")
tmpTrainRDD = tmpTrainDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmpTrainX = tmpTrainRDD.toDF()
csvTrainTmp = spark.read.format("csv").option("header", "true").load(
    imageDir + "/train25.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain,
                                     tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)

# Prepare Test Data
tmpTestDf = readImages(imageDir + "/test5")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option("header", "true").load(
    imageDir + "/test5.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()
finalTestDataFrame = tmptestX.join(csvTest,
                                   tmptestX.fileName == csvTest.image,
                                   'inner').drop(csvTest.image)

from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
def prepare_data(self):
    images_num = self.data_params["images_num"]
    train_ratio = self.data_params["train_ratio"]
    test_ratio = self.data_params["test_ratio"]
    img_dir = self.data_params["img_dir"]
    run_dir = self.data_params["run_dir"]
    hdfs_path = self.data_params["hdfs_path"]
    class_paths = {}
    class_paths_run = {}
    class_img_nums = {}
    dirlist = [
        item for item in os.listdir(img_dir)
        if os.path.isdir(os.path.join(img_dir, item))
    ]
    if os.path.isdir(run_dir):
        shutil.rmtree(run_dir)
    for directory in dirlist:
        class_paths[directory] = os.path.join(img_dir, directory)
        class_img_nums[directory] = len([
            name for name in os.listdir(class_paths[directory])
            if os.path.isfile(os.path.join(class_paths[directory], name))
        ])
        if images_num > class_img_nums[directory] or images_num == 0:
            files = os.listdir(class_paths[directory])
        else:
            files = os.listdir(class_paths[directory])[:images_num]
        for eachfilename in files:
            src = os.path.join(class_paths[directory], eachfilename)
            dst = os.path.join(run_dir, directory)
            if not os.path.exists(dst):
                os.makedirs(dst)
            retrn_val = shutil.copy(src, os.path.join(dst, eachfilename))
    hdfs_path_run = os.path.join(hdfs_path, "run")
    exists = os.system("hadoop fs -test -d %s" % (hdfs_path_run))
    if exists == 0:
        exists = os.system("hadoop fs -rm -r -skipTrash %s" % (hdfs_path_run))
    os.system("hadoop fs -copyFromLocal %s %s" % (run_dir, hdfs_path))
    for directory in dirlist:
        class_paths_run[directory] = os.path.join(hdfs_path, "run", directory)
    class0_df = readImages(class_paths_run[dirlist[0]]).withColumn(
        "label", lit(0))
    train_df, test_df = class0_df.randomSplit([train_ratio, test_ratio],
                                              seed=1234)
    for class_label in range(1, len(dirlist)):
        classi_df = readImages(
            class_paths_run[dirlist[class_label]]).withColumn(
                "label", lit(class_label))
        classi_train, classi_test = classi_df.randomSplit(
            [train_ratio, test_ratio], seed=1234)
        train_df = train_df.unionAll(classi_train)
        test_df = test_df.unionAll(classi_test)
    return train_df, test_df
def read_imageDir(img_dir):
    original_df = readImages(img_dir + "/jobs").withColumn("label", lit(1))
    otsu_df = readImages(img_dir + "/zuckerberg").withColumn("label", lit(0))
    return original_df, otsu_df
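# Usage sketch (the directory is a placeholder): split the two labeled
# DataFrames returned above into train/test sets and union them, mirroring the
# split-and-union pattern used by the other snippets here.
jobs_df, zuckerberg_df = read_imageDir("/path/to/personalities")
jobs_train, jobs_test = jobs_df.randomSplit([0.6, 0.4])
zuck_train, zuck_test = zuckerberg_df.randomSplit([0.6, 0.4])
train_df = jobs_train.unionAll(zuck_train)
test_df = jobs_test.unionAll(zuck_test)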
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from sparkdl import readImages

csv = spark.read.format("csv").option("header", "true").load(
    "PATH/TO/CSV").select('level').limit(100).withColumn(
        "rowNumber",
        row_number().over(Window.partitionBy("level").orderBy("level")))
retinopathyDf = readImages("PATH/TO/IMAGES").limit(100).withColumn(
    "rowNumber",
    row_number().over(Window.partitionBy("filepath").orderBy("image")))
finalDataFrame = retinopathyDf.join(csv,
                                    retinopathyDf.rowNumber == csv.rowNumber,
                                    'inner').drop(csv.rowNumber)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3,
                        labelCol="level")
p = Pipeline(stages=[featurizer, lr])
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import col, udf, substring
from pyspark.sql.types import DoubleType, StringType
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
import sparkdl as dl
from sparkdl import DeepImageFeaturizer

#sc = init_nncontext("sparkdl")
conf = SparkConf().setAppName("sparkdl").setMaster("yarn")
sc = SparkContext(conf=conf)

image_path = "hdfs:///project_data/pets/train_images/"
csv_path = "hdfs:///project_data/pets/train/train.csv"

sql_sc = SQLContext(sc)
csv_df = sql_sc.read.format("csv").option("header", "true").load(csv_path)
csv_df.printSchema()

image_DF = dl.readImages(image_path).withColumn("id",
                                                substring("filePath", 50, 9))
image_DF.printSchema()
image_DF.show(10)

labelDF = image_DF.join(csv_df, image_DF.id == csv_df.PetID, "left").withColumn(
    "label", col("AdoptionSpeed").cast("double") + 1).select("image", "label")
#labelDF.count()
labelDF = labelDF.na.drop().limit(2000)
#labelDF.count()

(trainingDF, validationDF) = labelDF.randomSplit([0.7, 0.3])
trainingDF.show(10)
print("show over")

vectorizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName='InceptionV3')
logreg = LogisticRegression(maxIter=30, regParam=0.05, elasticNetParam=0.3,
                            labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[vectorizer, logreg])
pipeline_model = pipeline.fit(trainingDF)
# Notebook to make model, originally ran on Databricks

# Import libraries
from sparkdl import readImages
from pyspark.sql.functions import lit

# make the images dataframes
path = "/CS4301/normalize/"
food_df = readImages(path + "food").withColumn("label", lit(0))
inside_df = readImages(path + "inside").withColumn("label", lit(1))
outside_df = readImages(path + "outside").withColumn("label", lit(2))
menu_df = readImages(path + "menu").withColumn("label", lit(3))
drink_df = readImages(path + "drink").withColumn("label", lit(4))

# Make the training and testing dataset
f_train, f_test = food_df.randomSplit([0.8, 0.2])
i_train, i_test = inside_df.randomSplit([0.8, 0.2])
o_train, o_test = outside_df.randomSplit([0.8, 0.2])
m_train, m_test = menu_df.randomSplit([0.8, 0.2])
d_train, d_test = drink_df.randomSplit([0.8, 0.2])

# Union the datasets
# Function used to union several dataframes
from pyspark.sql import DataFrame
from functools import reduce


def unionAll(*dfs):
    # standard reduce-based union of an arbitrary number of DataFrames
    # (minimal body, assumed from the comment above)
    return reduce(DataFrame.unionAll, dfs)
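# Usage sketch: combine the per-class splits with the helper above to form the
# final training and testing DataFrames (names follow the splits defined above).
train_df = unionAll(f_train, i_train, o_train, m_train, d_train)
test_df = unionAll(f_test, i_test, o_test, m_test, d_test)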
# ./bin/pyspark --packages databricks:spark-deep-learning:0.1.0-spark2.1-s_2.11 --driver-memory 5g
from sparkdl import readImages
from pyspark.sql.functions import lit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

img_dir = "/home/gayatri/Documents/Spark/DeepLearning/personalities/"

#Read images and Create training & test DataFrames for transfer learning
person1_df = readImages(img_dir + "/person1").withColumn("label", lit(1))
person2_df = readImages(img_dir + "/person2").withColumn("label", lit(0))
person1_train, person1_test = person1_df.randomSplit([0.6, 0.4])
person2_train, person2_test = person2_df.randomSplit([0.6, 0.4])

#dataframe for training a classification model
train_df = person1_train.unionAll(person2_train)

#dataframe for testing the classification model
test_df = person1_test.unionAll(person2_test)

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3,
                        labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)

predictions = p_model.transform(test_df)
predictions.select("filePath", "prediction").show(truncate=False)
# image import
from pyspark.sql import SparkSession
from pyspark import SparkContext

sc = SparkContext()
spark = SparkSession(sc)

from sparkdl import readImages
from pyspark.sql.functions import lit

img_dir = "hdfs:///flower-classify/flowers"

#Read images and Create training & test DataFrames for transfer learning
daisy_df = readImages(img_dir + "/daisy").withColumn("label", lit(0))
dandelion_df = readImages(img_dir + "/dandelion").withColumn("label", lit(1))
roses_df = readImages(img_dir + "/roses").withColumn("label", lit(2))
sunflowers_df = readImages(img_dir + "/sunflowers").withColumn("label", lit(3))
tulips_df = readImages(img_dir + "/tulips").withColumn("label", lit(4))

# per-class train/test split (assumed 60/40, matching the other snippets)
daisy_train, daisy_test = daisy_df.randomSplit([0.6, 0.4])
dandelion_train, dandelion_test = dandelion_df.randomSplit([0.6, 0.4])
roses_train, roses_test = roses_df.randomSplit([0.6, 0.4])
sunflowers_train, sunflowers_test = sunflowers_df.randomSplit([0.6, 0.4])
tulips_train, tulips_test = tulips_df.randomSplit([0.6, 0.4])

#dataframe for training a classification model
train_df = daisy_train.unionAll(dandelion_train).unionAll(
    roses_train).unionAll(sunflowers_train).unionAll(tulips_train)
print("TRAIN DF PREPARED.")

#dataframe for testing the classification model
test_df = daisy_test.unionAll(dandelion_test).unionAll(roses_test).unionAll(
    sunflowers_test).unionAll(tulips_test)

# model creation
from pyspark.ml.classification import LogisticRegression
import tensorflow as tf
import pyspark.sql.functions as f
import sparkdl as dl
from pyspark.sql.functions import lit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# root directory
img_dir = "filtered_dataset/"
magnifications = ['40', '100', '200', '400']

for m in magnifications:
    b_df = dl.readImages(img_dir + "/b" + m).withColumn("label", lit(1))
    m_df = dl.readImages(img_dir + "/m" + m).withColumn("label", lit(0))

    # Splitting the data into training and test in the ratio 80% : 20%
    trainb, testb = b_df.randomSplit([80.00, 20.00], seed=42)
    trainm, testm = m_df.randomSplit([80.00, 20.00], seed=42)

    # Combining the benign and malignant datasets for training and testing
    trainDF = trainb.unionAll(trainm)
    testDF = testb.unionAll(testm)

    vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features",
                                        modelName='InceptionV3')
    logreg = LogisticRegression(maxIter=10, regParam=0.01,
                                elasticNetParam=0.1, labelCol="label",
from pyspark.ml.classification import OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from sparkdl import readImages
from pyspark.sql import Row
from sparkdl import DeepImageFeaturizer
from pyspark.ml.classification import OneVsRestModel
from pyspark.sql.functions import lit

# Prepare Test Data
imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/dst/resized/"
labelZeroDf = readImages(imageDir + "tl0").withColumn("label", lit(0))
labelOneDf = readImages(imageDir + "tl1").withColumn("label", lit(1))
labelTwoDf = readImages(imageDir + "tl2").withColumn("label", lit(2))
labelThreeDf = readImages(imageDir + "tl3").withColumn("label", lit(3))
labelFourDf = readImages(imageDir + "tl4").withColumn("label", lit(4))
finalTestDf = labelZeroDf.unionAll(labelOneDf).unionAll(labelTwoDf).unionAll(
    labelThreeDf).unionAll(labelFourDf)
testSize = finalTestDf.count()
print(str(testSize))

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDf)
model_dc = OneVsRestModel.load(imageDir + 'model-decision-tree-classifier-new')
predictions = model_dc.transform(featureVector)
predictions.show()
from sparkdl import readImages
from pyspark.sql.functions import lit

#define data directory
imgDir = "/home/mvk/image-classification/sushant"

#read Images for training data without diabetic-retinopathy as negDf
negDf = readImages(imgDir + "/training/negative").withColumn("label", lit(0))
#read Images for training data with diabetic-retinopathy as posDf
posDf = readImages(imgDir + "/training/positive").withColumn("label", lit(1))
#read Images for test data without diabetic-retinopathy as testNegDf
testNegDf = readImages(imgDir + "/test/negative").withColumn("label", lit(0))
#read Images for test data with diabetic-retinopathy as testPosDf
testPosDf = readImages(imgDir + "/test/positive").withColumn("label", lit(1))

#prepare complete training data by combining negDf and posDf
trainDf = negDf.unionAll(posDf)
#prepare complete test data by combining testNegDf and testPosDf
testDf = testNegDf.unionAll(testPosDf)

#various required imports
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

#use of DeepImageFeaturizer for extracting features for image classification
#for provided input images
from sparkdl import readImages
from pyspark.sql.functions import lit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

img_dir = "/home/mvk/images_classification-master/personalities"

jobs_df = readImages(img_dir + "/jobs").withColumn("label", lit(1))
zuckerberg_df = readImages(img_dir + "/zuckerberg").withColumn("label", lit(0))
jobs_train, jobs_test = jobs_df.randomSplit([0.6, 0.4])
zuckerberg_train, zuckerberg_test = zuckerberg_df.randomSplit([0.6, 0.4])
train_df = jobs_train.unionAll(zuckerberg_train)
test_df = jobs_test.unionAll(zuckerberg_test)

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3,
                        labelCol="label")
p = Pipeline(stages=[featurizer, lr])
p_model = p.fit(train_df)

predictions = p_model.transform(test_df)
predictions.select("filePath", "prediction").show(truncate=False)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

df = p_model.transform(test_df)
df.show()
predictionAndLabels = df.select("prediction", "label")
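# Follow-up sketch: score the selected prediction/label columns with the
# evaluator imported above. "accuracy" is one of the supported metrics, and the
# evaluator's default label/prediction column names match the columns selected.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy =", evaluator.evaluate(predictionAndLabels))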
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.ml.classification import LogisticRegression, OneVsRest
from sparkdl import readImages
from sparkdl import DeepImageFeaturizer

sc = SparkContext()
spark = SparkSession(sc)

#imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work"
imageDir = "hdfs://192.168.65.188:8020/paih/"
#imageDir = "hdfs://10.0.0.7:8020/paih/"


def getFileName(filePath):
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName


# Prepare Train Data
tmpTrainDf = readImages(imageDir + "/train25")
#tmpTrainDf = readImages(imageDir + "/test1")
tmpTrainRDD = tmpTrainDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmpTrainX = tmpTrainRDD.toDF()
csvTrainTmp = spark.read.format("csv").option("header", "true").load(
    imageDir + "/train25.csv")
#csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain,
                                     tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
method = LogisticRegression(maxIter=50, regParam=0.05, elasticNetParam=0.3,
                            labelCol="label")
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
#dataframe for testing the classification model
import sys

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer

spark = SparkSession.builder.appName(
    'Weather Image Classifier - Data Analysis').getOrCreate()
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.2'  # make sure we have Spark 2.2+

from sparkdl import readImages

img_dir = "katkam-scaled"

#Read images and Create training & test DataFrames for transfer learning
jobs_df = readImages(img_dir)
jobs_df.show()
df = DeepImageFeaturizer(inputCol="image", outputCol="features",
                         modelName="InceptionV3").transform(jobs_df)
df.show()
i = 2
df = StringIndexer(inputCol="label",
                   outputCol="indexedLabel").fit(df).transform(df)
jobs_train, jobs_test = df.randomSplit([0.6, 0.4])
lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3,
                        labelCol="label")
p_model = lr.fit(jobs_train)
predictions = p_model.transform(jobs_test)
predictions.select("filePath", "prediction").show(truncate=False)
# pyspark --packages databricks:spark-deep-learning:0.1.0-spark2.1-s_2.11
# https://github.com/databricks/spark-deep-learning/issues/18
# from sparkdl import readImages
import sparkdl

image_df = sparkdl.readImages(
    "/Users/502677522/Class/DataWeekend/2017_12_02/data/test/1.jpg")
image_df.show()

from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import img_to_array, load_img
import numpy as np
import os
from sparkdl import KerasImageFileTransformer


def loadAndPreprocessKerasInceptionV3(uri):
    # this is a typical way to load and prep images in keras
    image = img_to_array(load_img(uri, target_size=(299, 299)))
    image = np.expand_dims(image, axis=0)
    return preprocess_input(image)


uri = "/Users/502677522/Class/DataWeekend/2017_12_02/data/test/1.jpg"
dat = loadAndPreprocessKerasInceptionV3(uri)
# >>> dat.shape
# (1, 299, 299, 3)
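# Sketch of wiring the loader above into KerasImageFileTransformer (imported
# earlier). Assumptions: an active SparkSession named `spark`, and a saved
# Keras InceptionV3 model at the placeholder path below.
transformer = KerasImageFileTransformer(inputCol="uri", outputCol="predictions",
                                        modelFile="/tmp/model-full.h5",
                                        imageLoader=loadAndPreprocessKerasInceptionV3,
                                        outputMode="vector")
uri_df = spark.createDataFrame([(uri,)], ["uri"])
predictions_df = transformer.transform(uri_df)
predictions_df.show()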
#sc = SparkContext()
#spark = SparkSession(sc)
imageDir = "/media/prateek/2A8BEA421AC55874/PAIH/work/"
#imageDir = "hdfs://192.168.65.188:8020/paih/"


def getFileName(filePath):
    fileName = os.path.basename(filePath).split(".")[0]
    return fileName


# Prepare Test Data
tmpTestDf = readImages(imageDir + "train25_2")
tmpTestRDD = tmpTestDf.rdd.map(
    lambda x: Row(filepath=x[0], image=x[1], fileName=getFileName(x[0])))
tmptestX = tmpTestRDD.toDF()
csvTestTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "train25_2.csv")
csvTestRDD = csvTestTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTest = csvTestRDD.toDF()
finalTestDataFrame = tmptestX.join(csvTest,
                                   tmptestX.fileName == csvTest.image,
                                   'inner').drop(csvTest.image)

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
featureVector = featurizer.transform(finalTestDataFrame)
del tmpTestDf
# Databricks notebook source
# MAGIC %md
# MAGIC ![Architecture](https://s3-us-west-1.amazonaws.com/databricks-binu-mathew/image/deep_learning/jets.png)

# COMMAND ----------

# MAGIC %md
# MAGIC ![Architecture](https://s3-us-west-1.amazonaws.com/databricks-binu-mathew/image/deep_learning/11.png)

# COMMAND ----------

from sparkdl import readImages
from pyspark.sql.functions import lit

img_dir = '/tmp/demo/planes'
passenger_jets_train_df = readImages(img_dir + "/passenger_jets").withColumn(
    "label", lit(1))
fighter_jets_train_df = readImages(img_dir + "/fighter_jets").withColumn(
    "label", lit(0))

#dataframe for training a classification model
train_df = passenger_jets_train_df.unionAll(fighter_jets_train_df)

#dataframe for testing the classification model
test_df = readImages('/bmathew/test_data')

# COMMAND ----------

import sys
sys.stdout.isatty = lambda: False
sys.stdout.encoding = 'utf-8'