def test_surprise_key(self):
    ke = KeyedEstimator(sklearnEstimator=PCA())
    schema = StructType().add("features", LongType()).add("key", LongType())
    df = self.spark.createDataFrame([], schema)

    # Fitting on an empty DataFrame yields an empty keyed-model mapping.
    km = ke.fit(df)
    self.assertEqual(km.keyedModels.collect(), [])
    self.assertEqual(km.keyedModels.dtypes,
                     [("key", LongType().simpleString()),
                      ("estimator", "sklearn-estimator")])

    # Transforming a key that was never seen during fit yields a null output.
    df = self.spark.createDataFrame([(1, 2)], schema)
    df = km.transform(df)
    self.assertEqual(df.collect(), [(1, 2, None)])
    self.assertEqual(df.dtypes,
                     [("features", "bigint"),
                      ("key", "bigint"),
                      ("output", "vector")])
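# A minimal usage sketch of the behavior exercised above (illustrative only;
# assumes a live SparkSession `spark` and DataFrames `df`/`df2` with "key" and
# "features" columns -- the KeyedEstimator defaults used in this test, not new API):
#
#   ke = KeyedEstimator(sklearnEstimator=PCA())
#   km = ke.fit(df)           # fits one sklearn model per distinct key
#   out = km.transform(df2)   # rows whose key was never seen during fit
#                             # get None in the "output" column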
def checkKeyedModelEquivalent(self, minExamples, featureGen, labelGen, **kwargs):
    NUSERS = 10
    # featureGen() should generate a np rank-1 ndarray of equal length
    # labelGen() should generate a scalar
    assert (labelGen is not None) == ("yCol" in kwargs)
    isPredictor = labelGen is not None
    # sklearn's LinearRegression estimator is stable even if underdetermined.
    # User keys are just [0, NUSERS), repeated for each key if there are multiple columns.
    # The i-th user has max(minExamples, i + 1) examples.
    keyCols = kwargs.get("keyCols", KeyedEstimator._paramSpecs["keyCols"]["default"])
    outputCol = kwargs.get("outputCol", KeyedEstimator._paramSpecs["outputCol"]["default"])
    xCol = kwargs.get("xCol", KeyedEstimator._paramSpecs["xCol"]["default"])

    nExamplesPerUser = lambda i: max(minExamples, i + 1)
    userKeys = [[i for _ in keyCols] for i in range(NUSERS)]
    features = [[featureGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
    useless = [["useless col" for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
    if isPredictor:
        labels = [[labelGen() for _ in range(nExamplesPerUser(i))] for i in range(NUSERS)]
    else:
        labels = None

    # Fit one local sklearn estimator per user as the expected reference.
    Xs = [np.vstack(x) for x in features]
    ys = [np.array(y) for y in labels] if isPredictor else repeat(None)
    localEstimators = [sklearn.base.clone(kwargs["sklearnEstimator"]).fit(X, y)
                       for X, y in zip(Xs, ys)]
    expectedDF = pd.DataFrame(userKeys, columns=keyCols)
    expectedDF["estimator"] = localEstimators

    def flattenAndConvertNumpy(x):
        return [Vectors.dense(i) if isinstance(i, np.ndarray) else i
                for i in chain.from_iterable(x)]

    inputDF = pd.DataFrame.from_dict(
        {k: [i for i in range(NUSERS) for _ in range(nExamplesPerUser(i))]
         for k in keyCols})
    inputDF[xCol] = flattenAndConvertNumpy(features)
    inputDF["useless"] = flattenAndConvertNumpy(useless)
    if labels:
        inputDF[kwargs["yCol"]] = flattenAndConvertNumpy(labels)
    inputDF = self.spark.createDataFrame(inputDF)

    ke = KeyedEstimator(**kwargs)
    km = ke.fit(inputDF)

    actualDF = km.keyedModels.toPandas()
    _assertPandasAlmostEqual(actualDF, expectedDF, keyCols)

    # Test users with different amounts of points.
    nTestPerUser = lambda i: NUSERS // 4 if i < NUSERS // 2 else NUSERS * 3 // 4
    testFeatures = [[featureGen() for _ in range(nTestPerUser(i))] for i in range(NUSERS)]
    # The "useless" column has nothing to do with computation, but is essential for
    # keeping order the same between the spark and non-spark versions.
    useless = [range(nTestPerUser(i)) for i in range(NUSERS)]

    inputDF = pd.DataFrame.from_dict(
        {k: [i for i in range(NUSERS) for _ in range(nTestPerUser(i))]
         for k in keyCols})
    inputDF[xCol] = flattenAndConvertNumpy(testFeatures)
    inputDF["useless"] = flattenAndConvertNumpy(useless)

    estimatorType = km.sklearnEstimatorType  # tested to be correct elsewhere

    def makeOutput(estimator, X):
        if estimatorType == "transformer":
            return estimator.transform(X)
        else:
            assert estimatorType == "predictor" or estimatorType == "clusterer"
            return estimator.predict(X).tolist()

    Xs = [np.vstack(x) for x in testFeatures]
    expectedOutput = map(makeOutput, localEstimators, Xs)
    expectedDF = inputDF.copy(deep=True)
    expectedDF[outputCol] = flattenAndConvertNumpy(expectedOutput)

    inputDF = self.spark.createDataFrame(inputDF)
    actualDF = km.transform(inputDF).toPandas()
    _assertPandasAlmostEqual(actualDF, expectedDF, keyCols + ["useless"])
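# Per-key equivalence in a nutshell (an illustrative sketch, not test code;
# `est`, `rowsGroupedByKey`, `X_k`, and `y_k` are placeholder names): the
# Spark-side fit checked above should agree with fitting an independent clone
# of the sklearn estimator on each key's rows locally:
#
#   perKeyModels = {k: sklearn.base.clone(est).fit(X_k, y_k)
#                   for k, (X_k, y_k) in rowsGroupedByKey.items()}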