    def test_inceptionV3_featurization(self):
        """
        Tests that InceptionV3 featurization returns the expected number of output features
        for each input image.
        """
        output_col = "prediction"
        transformer = DeepImageFeaturizer(inputCol="image", outputCol=output_col,
                                          modelName="InceptionV3")

        image_df = getSampleImageDF()
        transformed_df = transformer.transform(image_df.limit(5))

        collected = transformed_df.collect()
        for row in collected:
            predictions = row[output_col]
            self.assertEqual(len(predictions), InceptionV3Constants.NUM_OUTPUT_FEATURES)

    def test_featurization(self):
        """
        Tests that featurizer returns (almost) the same values as Keras.
        """
        output_col = "prediction"
        transformer = DeepImageFeaturizer(inputCol="image", outputCol=output_col,
                                          modelName=self.name)
        transformed_df = transformer.transform(self.imageDF)
        collected = transformed_df.collect()
        features = np.array([i.prediction for i in collected])

        # Note: keras features may be multi-dimensional np arrays, but transformer features
        # will be 1-d vectors. Regardless, the dimensions should add up to the same.
        self.assertEqual(np.prod(self.kerasFeatures.shape), np.prod(features.shape))
        kerasReshaped = self.kerasFeatures.reshape(self.kerasFeatures.shape[0], -1)
        np.testing.assert_array_almost_equal(kerasReshaped, features, decimal=6)
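
    # A small illustrative helper (hypothetical, not called by the tests above): Keras features
    # may be multi-dimensional per example, e.g. shape (5, 8, 8, 2048) for InceptionV3, while the
    # transformer returns flat vectors. Flattening each example to one row, e.g. (5, 131072),
    # makes the two directly comparable, which is what the reshape above does.
    @staticmethod
    def _flatten_per_example(keras_features):
        # One row per example; trailing dimensions are collapsed into a single axis.
        return np.asarray(keras_features).reshape(len(keras_features), -1)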

    def test_featurization_cosine(self):
        """
        Tests that featurizer returns (almost) the same values as Keras.
        """
        # Since we use different libraries for image resizing (PIL in Python vs. java.awt.Image
        # in Scala), the result will not match Keras exactly; the best we can do is a "somewhat
        # similar" result. At least check that the cosine distance is < 1e-2.
        featurizer_sc = DeepImageFeaturizer(modelName=self.name, inputCol="image",
                                            outputCol="features", scaleHint="SCALE_FAST")
        features_sc = np.array([i.features for i in featurizer_sc.transform(
            self.imageDF).select("features").collect()])
        kerasReshaped = self.kerasFeatures.reshape(self.kerasFeatures.shape[0], -1)
        diffs = [spatial.distance.cosine(kerasReshaped[i], features_sc[i])
                 for i in range(len(features_sc))]
        np.testing.assert_array_almost_equal(0, diffs, decimal=self.featurizerCompareDigitsCosine)
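
    # A minimal vectorized sketch (hypothetical, not part of the test suite) of the same
    # per-example cosine distances computed above with scipy.spatial.distance.cosine;
    # `a` and `b` are assumed to be 2-d arrays with one nonzero row per example.
    @staticmethod
    def _cosine_distances(a, b):
        # Normalize each row to unit length; cosine distance is then 1 - row-wise dot product.
        a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
        b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
        return 1.0 - np.sum(a_norm * b_norm, axis=1)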

    def test_inception(self):
        """
        Tests that DeepImageFeaturizer survives a save/load round trip with its uid, params,
        and default params intact.
        """
        transformer0 = DeepImageFeaturizer(inputCol='image', modelName="InceptionV3",
                                           outputCol="features0", scaleHint="SCALE_FAST")
        dst_path = os.path.join(self.tempdir, "featurizer")
        transformer0.save(dst_path)
        transformer1 = DeepImageFeaturizer.load(dst_path)
        self.assertEqual(transformer0.uid, transformer1.uid)
        self.assertEqual(type(transformer0.uid), type(transformer1.uid))
        for x in transformer0._paramMap.keys():
            self.assertEqual(transformer1.uid, x.parent,
                             "Loaded DeepImageFeaturizer instance uid (%s) did not match Param's uid (%s)"
                             % (transformer1.uid, x.parent))
        self.assertEqual(transformer0._paramMap, transformer1._paramMap,
                         "Loaded DeepImageFeaturizer instance params (%s) did not match "
                         % str(transformer1._paramMap) +
                         "original values (%s)" % str(transformer0._paramMap))
        self.assertEqual(transformer0._defaultParamMap, transformer1._defaultParamMap,
                         "Loaded DeepImageFeaturizer instance default params (%s) did not match "
                         % str(transformer1._defaultParamMap) +
                         "original defaults (%s)" % str(transformer0._defaultParamMap))

    def test_featurization_no_reshape(self):
        """
        Runs the sparkdl featurizer on manually resized images and compares the result to
        the Keras result.
        """
        imageArray = self.imageArray
        # test: featurizer vs keras on resized images
        rdd = self.sc.parallelize([self._rowWithImage(img) for img in imageArray])
        dfType = ImageSchema.imageSchema
        imageDf = rdd.toDF(dfType)
        if self.numPartitionsOverride:
            imageDf = imageDf.coalesce(self.numPartitionsOverride)
        transformer = DeepImageFeaturizer(inputCol='image', modelName=self.name,
                                          outputCol="features")
        dfFeatures = transformer.transform(imageDf).collect()
        dfFeatures = np.array([i.features for i in dfFeatures])
        kerasReshaped = self.kerasFeatures.reshape(self.kerasFeatures.shape[0], -1)
        np.testing.assert_array_almost_equal(kerasReshaped, dfFeatures,
                                             decimal=self.featurizerCompareDigitsExact)

    def test_featurizer_in_pipeline(self):
        """
        Tests that featurizer fits into an MLlib Pipeline.
        Does not test how good the featurization is for generalization.
        """
        featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                         modelName=self.name)
        lr = LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label")
        pipeline = Pipeline(stages=[featurizer, lr])

        # add arbitrary labels to run logistic regression
        # TODO: it's weird that the test fails on some combinations of labels. check why.
        label_udf = udf(lambda x: abs(hash(x)) % 2, IntegerType())
        train_df = self.imageDF.withColumn("label", label_udf(self.imageDF["image"]["origin"]))

        lrModel = pipeline.fit(train_df)
        # see if we at least get the training examples right.
        # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
        pred_df_collected = lrModel.transform(train_df).collect()
        for row in pred_df_collected:
            self.assertEqual(int(row.prediction), row.label)
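
    # A minimal sketch (hypothetical helper, not called by the test above) of the training-accuracy
    # check performed in test_featurizer_in_pipeline, expressed as a reusable method: `model` is a
    # fitted PipelineModel and `df` is a DataFrame with "image" and integer "label" columns.
    @staticmethod
    def _training_accuracy(model, df):
        # Fraction of training rows whose predicted class matches the assigned label.
        rows = model.transform(df).select("prediction", "label").collect()
        correct = sum(1 for r in rows if int(r.prediction) == r.label)
        return float(correct) / len(rows)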