def test_binarizer(self):
    """Binarizer param handling: defaults, setters, copy(), and constructor kwargs.

    Covers the multi-column variant (inputCols/outputCols/thresholds present
    in the param list alongside the single-column params).
    """
    b0 = Binarizer()
    self.assertListEqual(
        b0.params,
        [b0.inputCol, b0.inputCols, b0.outputCol, b0.outputCols,
         b0.threshold, b0.thresholds],
    )
    # Bug fix: the original used bitwise `~` on a bool; ~False == -1 and
    # ~True == -2 are both truthy, so all([...]) could never fail. Use `not`.
    self.assertTrue(all(not b0.isSet(p) for p in b0.params))
    self.assertTrue(b0.hasDefault(b0.threshold))
    self.assertEqual(b0.getThreshold(), 0.0)

    b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
    # Only the single-column params were set; inputCols/outputCols/thresholds
    # remain unset, so not *all* params are set.
    self.assertTrue(not all(b0.isSet(p) for p in b0.params))
    self.assertEqual(b0.getThreshold(), 1.0)
    self.assertEqual(b0.getInputCol(), "input")
    self.assertEqual(b0.getOutputCol(), "output")

    # copy() keeps uid and param list but applies the extra param map.
    b0c = b0.copy({b0.threshold: 2.0})
    self.assertEqual(b0c.uid, b0.uid)
    self.assertListEqual(b0c.params, b0.params)
    self.assertEqual(b0c.getThreshold(), 2.0)

    # A freshly constructed instance gets a distinct uid.
    b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
    self.assertNotEqual(b1.uid, b0.uid)
    self.assertEqual(b1.getThreshold(), 2.0)
    self.assertEqual(b1.getInputCol(), "input")
    self.assertEqual(b1.getOutputCol(), "output")
def binarization_by_threshold(dataFrame, threshold, inputCol):
    """Binarize a continuous column against *threshold*.

    The binarized values are written to a new column named
    '<inputCol>_binarized'; the input DataFrame is not modified.
    """
    output_col = '%s_binarized' % (inputCol)
    transformer = Binarizer(threshold=threshold,
                            inputCol=inputCol,
                            outputCol=output_col)
    result = transformer.transform(dataFrame)
    print('Binarizer output with Threshold = %f' % transformer.getThreshold())
    return result
def pre_processing(continuousDataFrame):
    """Binarize the 'feature' column at threshold 0.5 and display the result.

    Side effects only (print + show); returns None.
    """
    bz = Binarizer(threshold=0.5,
                   inputCol="feature",
                   outputCol="binarized_feature")
    transformed = bz.transform(continuousDataFrame)
    print("Binarizer output with Threshold = %f" % bz.getThreshold())
    transformed.show()
def binarizer(self, df, column, threshold=5.1):
    """Binarize a continuous column against a threshold (Binarizer).

    Args:
        df: input DataFrame.
        column: name of the continuous column to binarize.
        threshold: cut-off value for the binarization. Generalized from the
            previously hard-coded constant; defaults to 5.1 for backward
            compatibility with existing callers.

    Returns:
        DataFrame with an added '<column>_binarized_feature' column.
    """
    binarizer = Binarizer(threshold=threshold,
                          inputCol=column,
                          outputCol=column + '_binarized_feature')
    binarizedDataFrame = binarizer.transform(df)
    print('Binarizer output with Threshold = %f' % binarizer.getThreshold())
    return binarizedDataFrame
def test_binarizer(self):
    """Binarizer param handling: defaults, setters, copy(), and constructor kwargs."""
    b0 = Binarizer()
    self.assertListEqual(b0.params, [b0.inputCol, b0.outputCol, b0.threshold])
    # Bug fix: the original used bitwise `~` on a bool; ~False == -1 and
    # ~True == -2 are both truthy, so all([...]) could never fail. Use `not`.
    self.assertTrue(all(not b0.isSet(p) for p in b0.params))
    self.assertTrue(b0.hasDefault(b0.threshold))
    self.assertEqual(b0.getThreshold(), 0.0)

    b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
    # All three params have now been explicitly set.
    self.assertTrue(all(b0.isSet(p) for p in b0.params))
    self.assertEqual(b0.getThreshold(), 1.0)
    self.assertEqual(b0.getInputCol(), "input")
    self.assertEqual(b0.getOutputCol(), "output")

    # copy() keeps uid and param list but applies the extra param map.
    b0c = b0.copy({b0.threshold: 2.0})
    self.assertEqual(b0c.uid, b0.uid)
    self.assertListEqual(b0c.params, b0.params)
    self.assertEqual(b0c.getThreshold(), 2.0)

    # A freshly constructed instance gets a distinct uid.
    b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
    self.assertNotEqual(b1.uid, b0.uid)
    self.assertEqual(b1.getThreshold(), 2.0)
    self.assertEqual(b1.getInputCol(), "input")
    self.assertEqual(b1.getOutputCol(), "output")
# COMMAND ---------- ###Binarizer, takes the numerical inputs and converts them into binary output (0 and 1) with respect to the threshold provided from pyspark.ml.feature import Binarizer continuousDataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)], ["id", "feature"]) binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature") binarizedDataFrame = binarizer.transform(continuousDataFrame) print("Binarizer output with Threshold = %f" % binarizer.getThreshold()) binarizedDataFrame.show() # COMMAND ---------- ###PCA is a statistical procedure used to reduce the vector's dimensions. This example reduces a 5 dimensional feature into a 3 dimensional pca feature from pyspark.ml.feature import PCA from pyspark.ml.linalg import Vectors data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ), (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ), (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )] df = spark.createDataFrame(data, ["features"]) pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") model = pca.fit(df)
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$

if __name__ == "__main__":
    # One SparkSession per application; the builder reuses an existing
    # session if one is already active.
    session = SparkSession.builder.appName("BinarizerExample").getOrCreate()

    # $example on$
    rows = [
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ]
    input_df = session.createDataFrame(rows, ["id", "feature"])

    # Values above 0.5 map to 1.0; values at or below it map to 0.0.
    transformer = Binarizer(threshold=0.5, inputCol="feature",
                            outputCol="binarized_feature")
    output_df = transformer.transform(input_df)

    print("Binarizer output with Threshold = %f" % transformer.getThreshold())
    output_df.show()
    # $example off$

    session.stop()
from pyspark.ml.feature import StopWordsRemover, NGram, Binarizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("stopworkremover").getOrCreate()

# --- StopWordsRemover: drop common English stop words from token lists ---
raw_df = spark.createDataFrame(
    [(0, ["I", "saw", "the", "red", "balloon"]),
     (1, ["Mary", "had", "a", "little", "lamb"])],
    ["id", "raw"])
stop_remover = StopWordsRemover(inputCol="raw", outputCol="Filtered")
stop_remover.transform(raw_df).show(truncate=False)

# --- NGram: build bigrams (n=2) from token lists ---
token_df = spark.createDataFrame(
    [(0, ["Hi", "I", "heard", "about", "Spark"]),
     (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
     (2, ["Logistic", "regression", "models", "are", "neat"])],
    ["id", "words"])
bigram_df = NGram(n=2, inputCol="words", outputCol="ngrams").transform(token_df)
bigram_df.select("ngrams").show(truncate=False)

# --- Binarizer: threshold continuous 'feature' values at 0.1 ---
feature_df = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                   ["id", "feature"])
thresholder = Binarizer(threshold=0.1, inputCol="feature",
                        outputCol="binarized_feature")
binary_df = thresholder.transform(feature_df)
print("binarizer threshold", thresholder.getThreshold())
binary_df.show(truncate=False)