def test_inceptionV3_featurization(self):
    output_col = "prediction"
    transformer = DeepImageFeaturizer(inputCol="image", outputCol=output_col,
                                      modelName="InceptionV3")
    image_df = getSampleImageDF()
    transformed_df = transformer.transform(image_df.limit(5))
    collected = transformed_df.collect()
    for row in collected:
        predictions = row[output_col]
        self.assertEqual(len(predictions), InceptionV3Constants.NUM_OUTPUT_FEATURES)
def test_featurization(self):
    """ Tests that featurizer returns (almost) the same values as Keras. """
    output_col = "prediction"
    transformer = DeepImageFeaturizer(inputCol="image", outputCol=output_col,
                                      modelName=self.name)
    transformed_df = transformer.transform(self.imageDF)
    collected = transformed_df.collect()
    features = np.array([i.prediction for i in collected])
    # Note: keras features may be multi-dimensional np arrays, but transformer features
    # will be 1-d vectors. Regardless, the dimensions should add up to the same.
    self.assertEqual(np.prod(self.kerasFeatures.shape), np.prod(features.shape))
    kerasReshaped = self.kerasFeatures.reshape(self.kerasFeatures.shape[0], -1)
    np.testing.assert_array_almost_equal(kerasReshaped, features, decimal=6)
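# A minimal, synthetic sketch (not part of the test suite) of the shape check above:
# a multi-dimensional Keras feature map and the featurizer's flat per-image vector hold
# the same number of values, and reshape(n, -1) lines them up for element-wise comparison.
# The 8x8x2048 shape is only an assumed example (it matches the ~131k InceptionV3 features
# mentioned in the pipeline test further below).
import numpy as np

keras_like = np.zeros((5, 8, 8, 2048))    # stand-in for self.kerasFeatures
flat_like = np.zeros((5, 8 * 8 * 2048))   # stand-in for the transformer's 1-d vectors
assert np.prod(keras_like.shape) == np.prod(flat_like.shape)
np.testing.assert_array_almost_equal(keras_like.reshape(keras_like.shape[0], -1),
                                     flat_like, decimal=6)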
def test_featurization(self):
    """ Tests that featurizer returns (almost) the same values as Keras. """
    # Since we use different libraries for image resizing (PIL in python vs.
    # java.awt.Image in scala), the result will not match keras exactly. In fact the
    # best we can do is a "somewhat similar" result. At the least, check that the
    # cosine distance is < 1e-2.
    featurizer_sc = DeepImageFeaturizer(modelName=self.name, inputCol="image",
                                        outputCol="features", scaleHint="SCALE_FAST")
    features_sc = np.array([i.features for i in
                            featurizer_sc.transform(self.imageDF).select("features").collect()])
    kerasReshaped = self.kerasFeatures.reshape(self.kerasFeatures.shape[0], -1)
    diffs = [spatial.distance.cosine(kerasReshaped[i], features_sc[i])
             for i in range(len(features_sc))]
    np.testing.assert_array_almost_equal(0, diffs, decimal=self.featurizerCompareDigitsCosine)
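# A minimal, synthetic sketch (not part of the test suite) of the tolerance used above:
# per-row cosine distance between two feature matrices, asserted to be close to zero with
# np.testing.assert_array_almost_equal. The arrays here are stand-ins for the Keras and
# Spark features; the perturbation size and decimal=2 are assumed values chosen to echo
# the "< 1e-2" comment, not the actual featurizerCompareDigitsCosine setting.
import numpy as np
from scipy import spatial

rng = np.random.RandomState(0)
keras_like = rng.rand(5, 2048)                      # pretend Keras features
spark_like = keras_like + 1e-4 * rng.rand(5, 2048)  # same features, slightly perturbed
diffs = [spatial.distance.cosine(keras_like[i], spark_like[i])
         for i in range(len(spark_like))]
np.testing.assert_array_almost_equal(0, diffs, decimal=2)  # passes when distances are ~< 1e-2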
def test_inception(self):
    transformer0 = DeepImageFeaturizer(inputCol='image', modelName="InceptionV3",
                                       outputCol="features0", scaleHint="SCALE_FAST")
    dst_path = os.path.join(self.tempdir, "featurizer")
    transformer0.save(dst_path)
    transformer1 = DeepImageFeaturizer.load(dst_path)
    self.assertEqual(transformer0.uid, transformer1.uid)
    self.assertEqual(type(transformer0.uid), type(transformer1.uid))
    for x in transformer0._paramMap.keys():
        self.assertEqual(
            transformer1.uid, x.parent,
            "Loaded DeepImageFeaturizer instance uid (%s) did not match Param's uid (%s)"
            % (transformer1.uid, x.parent))
    self.assertEqual(transformer0._paramMap, transformer1._paramMap,
                     "Loaded DeepImageFeaturizer instance params (%s) did not match " %
                     str(transformer1._paramMap) +
                     "original values (%s)" % str(transformer0._paramMap))
    self.assertEqual(transformer0._defaultParamMap, transformer1._defaultParamMap,
                     "Loaded DeepImageFeaturizer instance default params (%s) did not match " %
                     str(transformer1._defaultParamMap) +
                     "original defaults (%s)" % str(transformer0._defaultParamMap))
def test_featurization_no_reshape(self):
    """ Run the sparkDL featurizer on manually-resized images and compare the result
    to the Keras result. """
    imageArray = self.imageArray
    # test: featurizer vs keras on resized images
    rdd = self.sc.parallelize([self._rowWithImage(img) for img in imageArray])
    dfType = ImageSchema.imageSchema
    imageDf = rdd.toDF(dfType)
    if self.numPartitionsOverride:
        imageDf = imageDf.coalesce(self.numPartitionsOverride)
    transformer = DeepImageFeaturizer(inputCol='image', modelName=self.name,
                                      outputCol="features")
    dfFeatures = transformer.transform(imageDf).collect()
    dfFeatures = np.array([i.features for i in dfFeatures])
    kerasReshaped = self.kerasFeatures.reshape(self.kerasFeatures.shape[0], -1)
    np.testing.assert_array_almost_equal(kerasReshaped, dfFeatures,
                                         decimal=self.featurizerCompareDigitsExact)
def test_featurizer_in_pipeline(self):
    """
    Tests that featurizer fits into an MLlib Pipeline.
    Does not test how good the featurization is for generalization.
    """
    featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                     modelName=self.name)
    lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
    pipeline = Pipeline(stages=[featurizer, lr])
    # Add arbitrary labels to run logistic regression.
    # TODO: it's weird that the test fails on some combinations of labels. Check why.
    label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
    train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))
    lrModel = pipeline.fit(train_df)
    # See if we at least get the training examples right.
    # With 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    pred_df_collected = lrModel.transform(train_df).collect()
    for row in pred_df_collected:
        self.assertEqual(int(row.prediction), row.label)