def _fit(self, dataset):
    """Fit one scikit-learn estimator per distinct key in ``dataset``.

    Groups the input by the configured key columns, fits a clone of the
    configured sklearn estimator on each group's feature column (plus the
    label column for predictors), and returns a ``KeyedModel`` wrapping the
    per-key fitted estimators.
    """
    keyCols = self.getOrDefault("keyCols")
    xCol = self.getOrDefault("xCol")
    yCol = self.getOrDefault("yCol")
    isLabelled = yCol is not None
    estimatorType = self.getOrDefault("estimatorType")
    # Only the "predictor" estimator type consumes a label column.
    assert isLabelled == (estimatorType == "predictor"), \
        "yCol is {}, but it should {}be None for a {} estimatorType".format(
            yCol, "not " if isLabelled else "", estimatorType)
    _validateXCol(dataset.schema, xCol)
    cols = keyCols[:]
    cols.append(xCol)
    if isLabelled:
        cols.append(yCol)
    oneDimensional = _isOneDimensional(dataset.schema, xCol)
    projected = dataset.select(*cols)  # also verifies all cols are present
    outputSchema = StructType().add("estimator", _SparkSklearnEstimatorUDT.sqlType())
    grouped = projected.groupBy(*keyCols)
    estimator = self.getOrDefault("sklearnEstimator")
    # Potential optimization: broadcast estimator
    import pandas as pd

    def fitEstimator(_, pandasDF):
        # Invoked once per key group by gapply(); returns a one-row frame
        # containing the pickled fitted estimator.
        X = _prepareXCol(pandasDF[xCol], oneDimensional)
        y = pandasDF[yCol].values if isLabelled else None
        # Potential optimization - del pandasDF
        estimatorClone = sklearn.base.clone(estimator)
        if y is None:
            # BUGFIX: some estimators' fit() accepts only X (e.g.,
            # sklearn.cluster.bicluster) — don't pass an explicit None label.
            estimatorClone.fit(X)
        else:
            estimatorClone.fit(X, y)
        pickled = pickle.dumps(estimatorClone)
        # Potential optimization - del estimatorClone
        # Until SPARK-15989 is resolved, we can't output the sklearn UDT directly here.
        return pd.DataFrame.from_records([(pickled,)])

    fitted = gapply(grouped, fitEstimator, outputSchema)
    extractSklearn = udf(
        lambda estimatorStr: SparkSklearnEstimator(pickle.loads(estimatorStr)),
        SparkSklearnEstimator.__UDT__)
    keyedSklearnEstimators = fitted.select(
        *chain(keyCols, [extractSklearn(fitted["estimator"]).alias("estimator")]))
    # BUGFIX: distinguish clusterers (integral cluster labels) from
    # transformers (vector output); previously both fell through to the
    # Vector UDT, which is wrong for clusterer predictions.
    if isLabelled:
        assert estimatorType == "predictor", estimatorType
        outputType = dataset.schema[yCol].dataType
    elif estimatorType == "clusterer":
        outputType = LongType()
    else:
        assert estimatorType == "transformer", estimatorType
        outputType = Vector.__UDT__
    # Also forward the template estimator so the model retains its parameters.
    return KeyedModel(sklearnEstimator=estimator, keyCols=keyCols, xCol=xCol,
                      outputCol=self.getOrDefault("outputCol"), yCol=yCol,
                      estimatorType=estimatorType,
                      keyedSklearnEstimators=keyedSklearnEstimators,
                      outputType=outputType)
def _fit(self, dataset):
    """Fit a per-key scikit-learn estimator and wrap the results in a KeyedModel.

    The dataset is grouped by the key columns; a fresh clone of the
    configured sklearn estimator is fit on each group's feature column
    (and label column, for predictors).
    """
    keyCols = self.getOrDefault("keyCols")
    xCol = self.getOrDefault("xCol")
    yCol = self.getOrDefault("yCol")
    isLabelled = yCol is not None
    estimatorType = self.getOrDefault("estimatorType")
    # A label column is required exactly when fitting a predictor.
    assert isLabelled == (estimatorType == "predictor"), \
        "yCol is {}, but it should {}be None for a {} estimatorType".format(
            yCol, "not " if isLabelled else "", estimatorType)
    _validateXCol(dataset.schema, xCol)
    cols = keyCols[:]
    cols.append(xCol)
    if isLabelled:
        cols.append(yCol)
    oneDimensional = _isOneDimensional(dataset.schema, xCol)
    projected = dataset.select(*cols)  # also verifies all cols are present
    outputSchema = StructType().add("estimator", _SparkSklearnEstimatorUDT.sqlType())
    grouped = projected.groupBy(*keyCols)
    estimator = self.getOrDefault("sklearnEstimator")
    # Potential optimization: broadcast estimator
    import pandas as pd

    def fitEstimator(_, pandasDF):
        # Per-group fit function executed by gapply().
        X = _prepareXCol(pandasDF[xCol], oneDimensional)
        y = pandasDF[yCol].values if isLabelled else None
        # Potential optimization - del pandasDF
        estimatorClone = sklearn.base.clone(estimator)
        if y is None:
            # BUGFIX: unsupervised estimators may define fit(X) with a single
            # argument (e.g., sklearn.cluster.bicluster); calling
            # fit(X, None) on them raises TypeError.
            estimatorClone.fit(X)
        else:
            estimatorClone.fit(X, y)
        pickled = pickle.dumps(estimatorClone)
        # Potential optimization - del estimatorClone
        # Until SPARK-15989 is resolved, we can't output the sklearn UDT directly here.
        return pd.DataFrame.from_records([(pickled, )])

    fitted = gapply(grouped, fitEstimator, outputSchema)
    extractSklearn = udf(
        lambda estimatorStr: SparkSklearnEstimator(
            pickle.loads(estimatorStr)),
        SparkSklearnEstimator.__UDT__)
    keyedSklearnEstimators = fitted.select(*chain(
        keyCols, [extractSklearn(fitted["estimator"]).alias("estimator")]))
    # BUGFIX: give clusterers an integral output type; previously they fell
    # into the transformer branch and were typed as vectors.
    if isLabelled:
        assert estimatorType == "predictor", estimatorType
        outputType = dataset.schema[yCol].dataType
    elif estimatorType == "clusterer":
        outputType = LongType()
    else:
        assert estimatorType == "transformer", estimatorType
        outputType = Vector.__UDT__
    # Forward the template estimator so the model keeps its configuration.
    return KeyedModel(sklearnEstimator=estimator, keyCols=keyCols, xCol=xCol,
                      outputCol=self.getOrDefault("outputCol"), yCol=yCol,
                      estimatorType=estimatorType,
                      keyedSklearnEstimators=keyedSklearnEstimators,
                      outputType=outputType)
def _fit(self, dataset):
    """Train one sklearn estimator for every distinct key combination.

    The dataset is grouped on the key columns; each group's features (and,
    for predictors, labels) are handed to a fresh clone of the configured
    sklearn estimator.  The fitted clones are pickled, collected into a
    keyed DataFrame, and bundled into the returned ``KeyedModel``.
    """
    keys = self.getOrDefault("keyCols")
    featureCol = self.getOrDefault("xCol")
    labelCol = self.getOrDefault("yCol")
    estimatorType = self.getOrDefault("estimatorType")
    supervised = labelCol is not None
    # Exactly the "predictor" type requires a label column.
    assert supervised == (estimatorType == "predictor"), \
        "yCol is {}, but it should {}be None for a {} estimatorType".format(
            labelCol, "not " if supervised else "", estimatorType)
    _validateXCol(dataset.schema, featureCol)
    selected = list(keys)
    selected.append(featureCol)
    if supervised:
        selected.append(labelCol)
    scalarFeatures = _isOneDimensional(dataset.schema, featureCol)
    projected = dataset.select(*selected)  # select() doubles as a column-presence check
    outputSchema = StructType().add("estimator", _SparkSklearnEstimatorUDT.sqlType())
    grouped = projected.groupBy(*keys)
    template = self.getOrDefault("sklearnEstimator")
    # Potential optimization: broadcast the template estimator.
    # Potential optimization (perhaps better at the gapply() level): batched
    # python evaluation may OOM when two large key groups land on the same
    # machine. (1) Key groups should be evenly distributed. (2) gapply could
    # make smarter use of memory and reduce copies. (3) Batched python
    # function evaluation could be smarter about its data handoff to python -
    # perhaps stream rows to the python worker through a pipe.
    import pandas as pd

    def fitEstimator(_, pandasDF):
        # Runs once per key group under gapply(); yields a single-row frame
        # carrying the pickled fitted estimator.
        X = _prepareXCol(pandasDF[featureCol], scalarFeatures)
        y = pandasDF[labelCol].values if supervised else None
        # Potential optimization - del pandasDF
        clone = sklearn.base.clone(template)
        if y is None:
            clone.fit(X)  # fit may have 1 argument (e.g., sklearn.cluster.bicluster)
        else:
            clone.fit(X, y)
        blob = pickle.dumps(clone)
        # Potential optimization - del clone
        # Until SPARK-15989 is resolved, we can't output the sklearn UDT directly here.
        return pd.DataFrame.from_records([(blob,)])

    fitted = gapply(grouped, fitEstimator, outputSchema)
    unpickle = udf(
        lambda raw: SparkSklearnEstimator(pickle.loads(raw)),
        SparkSklearnEstimator.__UDT__)
    keyedSklearnEstimators = fitted.select(
        *chain(keys, [unpickle(fitted["estimator"]).alias("estimator")]))
    # Output type mirrors the estimator type: the label column's type for
    # predictors, long cluster ids for clusterers, vectors for transformers.
    if supervised:
        assert estimatorType == "predictor", estimatorType
        outputType = dataset.schema[labelCol].dataType
    elif estimatorType == "clusterer":
        outputType = LongType()
    else:
        assert estimatorType == "transformer", estimatorType
        outputType = Vector.__UDT__
    return KeyedModel(sklearnEstimator=template, keyCols=keys, xCol=featureCol,
                      outputCol=self.getOrDefault("outputCol"), yCol=labelCol,
                      estimatorType=estimatorType,
                      keyedSklearnEstimators=keyedSklearnEstimators,
                      outputType=outputType)
def _fit(self, dataset):
    """Fit a scikit-learn estimator per key group and return a KeyedModel.

    Groups ``dataset`` by the configured key columns, clones the configured
    sklearn estimator for each group, fits the clone on that group's feature
    column (plus the label column for predictors), and collects the pickled
    fitted estimators into a keyed DataFrame wrapped by the returned
    ``KeyedModel``.
    """
    keyCols = self.getOrDefault("keyCols")
    xCol = self.getOrDefault("xCol")
    yCol = self.getOrDefault("yCol")
    isLabelled = yCol is not None
    estimatorType = self.getOrDefault("estimatorType")
    # Only "predictor" estimators take a label column; enforce the pairing.
    assert isLabelled == (estimatorType == "predictor"), \
        "yCol is {}, but it should {}be None for a {} estimatorType".format(
            yCol, "not " if isLabelled else "", estimatorType)
    _validateXCol(dataset.schema, xCol)
    cols = keyCols[:]
    cols.append(xCol)
    if isLabelled:
        cols.append(yCol)
    oneDimensional = _isOneDimensional(dataset.schema, xCol)
    projected = dataset.select(*cols)  # also verifies all cols are present
    outputSchema = StructType().add("estimator", _SparkSklearnEstimatorUDT.sqlType())
    grouped = projected.groupBy(*keyCols)
    estimator = self.getOrDefault("sklearnEstimator")
    # Potential optimization: broadcast estimator
    # Potential optimization (perhaps better on gapply() level): Currently,
    # batched python evaluation may cause OOM if two large key groups are put
    # on one machine. (1) Key groups should be evenly distributed. (2) gapply
    # could make smarter use of memory and reduce copies. (3) Batched python
    # function evaluation can be smart on its data handoff to python - perhaps
    # it could set up a pipe with the python process for per-row data loading.
    import pandas as pd

    def fitEstimator(_, pandasDF):
        # Runs once per key group inside gapply(); returns a one-row frame
        # holding the pickled fitted estimator.
        X = _prepareXCol(pandasDF[xCol], oneDimensional)
        y = pandasDF[yCol].values if isLabelled else None
        # Potential optimization - del pandasDF
        estimatorClone = sklearn.base.clone(estimator)
        if y is None:
            estimatorClone.fit(X)  # fit may have 1 argument (e.g., sklearn.cluster.bicluster)
        else:
            estimatorClone.fit(X, y)
        pickled = pickle.dumps(estimatorClone)
        # Potential optimization - del estimatorClone
        # Until SPARK-15989 is resolved, we can't output the sklearn UDT directly here.
        return pd.DataFrame.from_records([(pickled, )])

    fitted = gapply(grouped, fitEstimator, outputSchema)
    # Deserialize the pickled estimators back into the sklearn UDT column.
    extractSklearn = udf(
        lambda estimatorStr: SparkSklearnEstimator(
            pickle.loads(estimatorStr)),
        SparkSklearnEstimator.__UDT__)
    keyedSklearnEstimators = fitted.select(*chain(
        keyCols, [extractSklearn(fitted["estimator"]).alias("estimator")]))
    # Output column type depends on the estimator type: the label's type for
    # predictors, integral cluster ids for clusterers, vectors for transformers.
    if isLabelled:
        assert estimatorType == "predictor", estimatorType
        outputType = dataset.schema[yCol].dataType
    elif estimatorType == "clusterer":
        outputType = LongType()
    else:
        assert estimatorType == "transformer", estimatorType
        outputType = Vector.__UDT__
    return KeyedModel(sklearnEstimator=estimator, keyCols=keyCols, xCol=xCol,
                      outputCol=self.getOrDefault("outputCol"), yCol=yCol,
                      estimatorType=estimatorType,
                      keyedSklearnEstimators=keyedSklearnEstimators,
                      outputType=outputType)