def test_preserve_set_state(self):
    dataset = self.spark.createDataFrame([(0.5,)], ["data"])
    binarizer = Binarizer(inputCol="data")
    self.assertFalse(binarizer.isSet("threshold"))
    binarizer.transform(dataset)
    binarizer._transfer_params_from_java()
    self.assertFalse(binarizer.isSet("threshold"),
                     "Params not explicitly set should remain unset after transform")
def naiveOutliers(self, df, c):
    # Flag values more than 2 and 3 standard deviations above the mean (SDU)
    binarizer_2sdu = Binarizer(threshold=2.0, inputCol=c, outputCol="2SDU_" + c)
    binarizer_3sdu = Binarizer(threshold=3.0, inputCol=c, outputCol="3SDU_" + c)
    # Same thresholds for the downside (SDD), applied after negating the column
    binarizer_2sdd = Binarizer(threshold=2.0, inputCol=c, outputCol="2SDD_" + c)
    binarizer_3sdd = Binarizer(threshold=3.0, inputCol=c, outputCol="3SDD_" + c)
    df = binarizer_2sdu.transform(df.select('snapshotDate', 'ID', c))
    df = binarizer_3sdu.transform(df)
    # Negate the column so the upper-tail thresholds capture the lower tail
    df = df.withColumn(c, -1.0 * df[c])
    df = binarizer_2sdd.transform(df)
    df = binarizer_3sdd.transform(df)
    return df.select('snapshotDate', 'ID',
                     '2SDU_' + c, '3SDU_' + c, '2SDD_' + c, '3SDD_' + c)
def performance(prediction):
    '''Performance of model.'''
    binarizer = Binarizer(threshold=0.5, inputCol="prediction", outputCol="b_prediction")
    binarizedDataFrame = binarizer.transform(prediction)
    binarizer = Binarizer(threshold=0.5, inputCol="label", outputCol="b_label")
    binarizedDataFrame = binarizer.transform(binarizedDataFrame)
    prediction_label = binarizedDataFrame.select('b_prediction', 'b_label')
    metrics = BinaryClassificationMetrics(prediction_label.rdd)
    return metrics.areaUnderROC
def best_model(algo, bin, log):
    mdl = algo.fit(trainingData)
    pred = mdl.transform(testData)
    if bin:
        bina = Binarizer(threshold=0.5, inputCol="prediction_c", outputCol="prediction")
        pred = bina.transform(pred)
    acc = evaluator_multi.evaluate(pred)
    area_under_curve = evaluator_bin.evaluate(pred)
    print("Accuracy:", acc)
    print("Area Under ROC:", area_under_curve)
    print("Top Three Features")
    if log:
        # Logistic regression: rank features by absolute coefficient
        feature_importance = mdl.coefficients.values
        for f in np.abs(feature_importance).argsort()[-3:][::-1]:
            print(schemaNames[f + 1], end=" ")
        print("")
    else:
        # Tree-based model: expand the sparse featureImportances vector
        feature_importance = mdl.featureImportances
        top_features = np.zeros(ncolumns - 1)
        top_features[feature_importance.indices] = feature_importance.values
        for f in top_features.argsort()[-3:][::-1]:
            print(schemaNames[f + 1], end=" ")
        print("")
    return mdl  # return the fitted model (`return best_model` would return the function itself)
def test_model_binarizer(self):
    import numpy
    data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                      ["id", "feature"])
    model = Binarizer(inputCol='feature', outputCol='binarized')
    # the tensor name should match the Binarizer's inputCol
    model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                 [('feature', FloatTensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.select("binarized").toPandas().values.astype(numpy.float32)
    data_np = data.select('feature').toPandas().values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBinarizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['binarized'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def findmodel(algo, bin, log):
    model = algo.fit(trainingData)
    predictions = model.transform(testData)
    if bin:
        binarizer = Binarizer(threshold=0.5, inputCol="prediction_c", outputCol="prediction")
        predictions = binarizer.transform(predictions)
    accuracy = evaluatorM.evaluate(predictions)
    auc = evaluatorB.evaluate(predictions)
    print("Accuracy:", accuracy)
    print("Area Under ROC:", auc)
    print("Top Features")
    if log:
        fi = model.coefficients.values
        for i in np.abs(fi).argsort()[-3:][::-1]:
            print(schemaNames[i + 1], end=" ")
        print("")
    else:
        fi = model.featureImportances
        imp_feat = np.zeros(ncolumns - 1)
        imp_feat[fi.indices] = fi.values
        for i in imp_feat.argsort()[-3:][::-1]:
            print(schemaNames[i + 1], end=" ")
        print("")
    return model
def binarization_by_threshold(dataFrame, threshold, inputCol):
    # Binarize a continuous column against the given threshold
    binarizer = Binarizer(threshold=threshold, inputCol=inputCol,
                          outputCol='%s_binarized' % inputCol)
    binarizedDataFrame = binarizer.transform(dataFrame)
    print('Binarizer output with Threshold = %f' % binarizer.getThreshold())
    return binarizedDataFrame
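# A hedged usage sketch for the helper above; `df` and its numeric column
# 'score' are illustrative names, not from the original source.
result = binarization_by_threshold(df, 0.5, 'score')
result.select('score', 'score_binarized').show()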
def test_default_params_transferred(self):
    dataset = self.spark.createDataFrame([(0.5,)], ["data"])
    binarizer = Binarizer(inputCol="data")
    # intentionally change the pyspark default, but don't set it
    binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
    result = binarizer.transform(dataset).select("my_default").collect()
    self.assertFalse(binarizer.isSet(binarizer.outputCol))
    self.assertEqual(result[0][0], 1.0)
def pre_processing(continuousDataFrame):
    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
    binarizedDataFrame = binarizer.transform(continuousDataFrame)
    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
def binarizer(self, df, column):
    """Binarize a continuous column against a fixed threshold."""
    binarizer = Binarizer(threshold=5.1, inputCol=column,
                          outputCol=column + '_binarized_feature')
    binarizedDataFrame = binarizer.transform(df)
    print('Binarizer output with Threshold = %f' % binarizer.getThreshold())
    return binarizedDataFrame
def decisionTreeRegressor(data, ncolumns, schemaNames):
    from pyspark.ml.regression import DecisionTreeRegressor
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import numpy as np
    import time

    binarizer = Binarizer(threshold=0.00001, inputCol="features",
                          outputCol="binarized_features")
    binarizedDataFrame = binarizer.transform(data)
    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)
    dtr = DecisionTreeRegressor(labelCol="label", featuresCol="binarized_features",
                                maxDepth=10, maxBins=10, impurity='variance')
    start = time.time()
    model = dtr.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60  # training time in minutes
    prediction = model.transform(testData)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)
    # Expand the sparse featureImportances vector and pick the top three features
    fi = model.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    idx = (-imp_feat).argsort()[:3]
    feat = [schemaNames[i] for i in idx]
    return feat, rmse, areaUC, timer
def prep_data(sqlContext, data, drops):
    """Prepares data for ML. Preparation includes: making a label column
    (by the rule: naccess > 10), applying drops and transforming data
    into LabeledPoint."""
    binarizer = Binarizer(threshold=10.0, inputCol="naccess", outputCol="target")
    data = binarizer.transform(data)
    drops = drops.split(",")
    cols = [x for x in data.columns if x not in set(drops)]
    data = data.select(cols)
    labeled = label_data(data)
    preped_data = sqlContext.createDataFrame(labeled, ['features', 'label'])
    return preped_data
from pyspark.ml.feature import Binarizer
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
continuousDF = sqlContext.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.8)],
                                          ['label', 'features'])
binarizer = Binarizer(inputCol="features", outputCol="binarized_feature", threshold=0.5)
binarizedDF = binarizer.transform(continuousDF)
for bf in binarizedDF.select("binarized_feature").take(3):
    print(bf)
"""OUTPUT
Row(binarized_feature=0.0)
Row(binarized_feature=1.0)
Row(binarized_feature=1.0)
"""
print("Evaluating Oversampling dataset:") lrModel = lr.fit(o_train) prediction = lrModel.transform(o_test) rmse = eval.evaluate(prediction) print("RMSE: %.3f" % rmse) r2 = eval.evaluate(prediction, {eval.metricName: "r2"}) print("r2: %.3f" % r2) ## Random Forest ## oversampling print("Evaluating oversampling dataset Random Forest:") binarizer = Binarizer(threshold=0, inputCol="label", outputCol="bin_labels") train_bin = binarizer.transform(o_train) test_bin = binarizer.transform(o_test) evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction") dt = RandomForestClassifier(labelCol="bin_labels", featuresCol="pcaFeatures", numTrees=10, maxDepth=30) dtModel = dt.fit(train_bin) predictions = dtModel.transform(test_bin) accuracy = evaluator.evaluate(predictions) print("LR Accuracy = %g " % accuracy) print( 'AUC:', BinaryClassificationMetrics(predictions['label', 'prediction'].rdd).areaUnderROC)
paramGrid_regressor = ParamGridBuilder() \
    .addGrid(dt1.maxDepth, [5, 10, 30]) \
    .addGrid(dt1.maxBins, [20, 35, 40]) \
    .build()
crossval_regressor = CrossValidator(estimator=pipeline_regressor,
                                    estimatorParamMaps=paramGrid_regressor,
                                    evaluator=evaluator_regressor,
                                    numFolds=2, parallelism=3)
cvModel_regressor = crossval_regressor.fit(trainingData)
prediction_regressor = cvModel_regressor.transform(testData)
binarizer = Binarizer(threshold=0.5, inputCol="prediction",
                      outputCol="binarized_prediction")
binarizedDataFrame = binarizer.transform(prediction_regressor)
P1 = binarizedDataFrame.drop('prediction')
binarizedDataFrame_1 = P1.withColumnRenamed("binarized_prediction", "prediction")
evaluator_regressor1 = MulticlassClassificationEvaluator(labelCol="label",
                                                         predictionCol="prediction",
                                                         metricName="accuracy")
accuracy_regressor = evaluator_regressor1.evaluate(binarizedDataFrame_1)
print("Accuracy of DecisionTreeRegressor = %g " % accuracy_regressor)
evaluator_regressor_area_under_curve = BinaryClassificationEvaluator(
    rawPredictionCol="prediction")
accuracy_regressor_area_under_curve = evaluator_regressor_area_under_curve.evaluate(
    prediction_regressor)
print("Area Under the curve on Decision Trees for Regression ",
      accuracy_regressor_area_under_curve)
print("Printing Parameter for DecisionTreeRegressor")
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BinarizerExample")\
        .getOrCreate()

    # $example on$
    continuousDataFrame = spark.createDataFrame([
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ], ["id", "feature"])

    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
    # $example off$

    spark.stop()
handleInvalid="skip") qty_df = qty_indexer.fit(ohe_df).transform(ohe_df) qty_df.select("quantity", "quantity_indexed").display() # COMMAND ---------- # MAGIC %md Binarization of continous numerical data. # COMMAND ---------- from pyspark.ml.feature import Binarizer binarizer = Binarizer(threshold=10, inputCol="unit_price", outputCol="binarized_price") binarized_df = binarizer.transform(qty_df) binarized_df.select("quantity", "binarized_price").display() # COMMAND ---------- # MAGIC %md Transforming date/time columns # COMMAND ---------- from pyspark.sql.functions import month month_df = binarized_df.withColumn("invoice_month", month("invoice_time")) month_indexer = StringIndexer(inputCol="invoice_month", outputCol="month_indexed", handleInvalid="skip") month_df = month_indexer.fit(month_df).transform(month_df)
# pyspark.ml.feature module #
from pyspark.ml.feature import Binarizer

df = sparksession.createDataFrame([(0.5,)], ["values"])
df.collect()
binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
df2 = binarizer.transform(df)
df2.dtypes
df.collect()
df2.collect()
binarizer.getOutputCol()
rawData.take(1)
binarizer2 = Binarizer(threshold=0.5, inputCol="srv_diff_host_rate",
                       outputCol="features")
binarizer2.transform(rawData)
binarizer.explainParam('inputCol')
binarizer.inputCol
binarizer.params
rawData.select(['count']).show()
rawData.dtypes

from pyspark.ml.feature import StringIndexer

stringIndexer = StringIndexer(inputCol="y_label", outputCol='indexed_y_label')
model = stringIndexer.fit(rawData)
td = model.transform(rawData)
td.dtypes
df = df.na.drop()

# In[9]:

#df.count(), len(df.columns)

# Creating a categorical variable: let's create a categorical variable to denote if the humidity is not low. If the value is less than 25%, then we want the categorical value to be 0; otherwise the categorical value should be 1. We can create this categorical variable as a column in a DataFrame using Binarizer.

# In[10]:

# binarizer = Binarizer(threshold=24.99999, inputCol="relative_humidity_3pm", outputCol="label")
binarizer = Binarizer(threshold=24.99999, inputCol=target_col, outputCol="label")
binarizedDF = binarizer.transform(df)

# In[11]:

#binarizedDF.describe()

# # Creating target variable named label

# The threshold argument specifies the threshold value for the variable, inputCol is the input column to read, and outputCol is the name of the new categorical column. The second line applies the Binarizer and creates a new DataFrame with the categorical column. We can look at the first four values in the new DataFrame:

# In[12]:

#binarizedDF.select("relative_humidity_3pm", "label").show(4)

# The first row's humidity value is greater than 25%, so its label is 1. The other humidity values are less than 25% and have labels equal to 0.
from pyspark.ml.feature import Binarizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("binarizer").master("local").getOrCreate()
    dataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.3)],
                                      ["id", "feature"])
    binarizer = Binarizer(inputCol="feature", outputCol="binarizer", threshold=0.5)
    binarizerDataFrame = binarizer.transform(dataFrame)
    binarizerDataFrame.show()
    spark.stop()
def binarize_training_data(trainingRawData):
    binarizer = Binarizer() \
        .setInputCol("rating") \
        .setOutputCol("label") \
        .setThreshold(3.5)
    return binarizer.transform(trainingRawData)
# Useful transformation functions:
# Binarizer: converts continuous variables to 1/0 depending on a set threshold
# Bucketizer: similar, but for multi-class problems
# MaxAbsScaler: rescale data to the [-1, 1] range
# MinMaxScaler: rescale data to the [0, 1] range
# OneHotEncoder: encodes a categorical column as binary vectors
# PCA: principal component analysis
# StandardScaler: convert so mean = 0 and sd = 1

from pyspark.ml.feature import Binarizer

binarizer = Binarizer(threshold=500, inputCol="Yearly Amount Spent", outputCol="label")
binarizedDataFrame = binarizer.transform(final_data)
binarizedDataFrame = binarizedDataFrame.drop("Yearly Amount Spent")
binarizedDataFrame.show()

from pyspark.ml.classification import LogisticRegression

logReg = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
fitted_logReg = logReg.fit(binarizedDataFrame)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(fitted_logReg.coefficients))
print("Intercept: " + str(fitted_logReg.intercept))

#log_summary = fitted_logReg.summary

# Evaluate the fitted model; evaluate() expects a DataFrame, not the model
# itself (no separate test split exists in this snippet)
predictions_and_labels = fitted_logReg.evaluate(binarizedDataFrame)
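# The other transformers listed above follow the same pattern, except that the
# scalers are estimators and must be fit first. A minimal sketch of MinMaxScaler,
# assuming `final_data` also carries an assembled vector column named "features"
# (an illustrative name, not from the original source):
from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(final_data)  # unlike Binarizer, MinMaxScaler must be fit
scaled_df = scaler_model.transform(final_data)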
# ## Generate label

# We can treat `star_rating` as a continuous numerical label or an ordered
# categorical label:
filtered.groupBy("star_rating").count().orderBy("star_rating").show()

# Rather than try to predict each value, let us see if we can distinguish
# between five-star and non-five-star ratings. We can use the
# [Binarizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
# to create our binary label:
from pyspark.ml.feature import Binarizer

converted = filtered.withColumn("star_rating", col("star_rating").cast("double"))
binarizer = Binarizer(inputCol="star_rating", outputCol="high_rating", threshold=4.5)
labeled = binarizer.transform(converted)
labeled.crosstab("star_rating", "high_rating").show()

# **Note:** `Binarizer` does not like integer values, thus we had to convert to doubles.

# ## Extract, transform, and select features

# Create function to explore features:
def explore(df, feature, label, plot=True):
    from pyspark.sql.functions import count, mean
    aggregated = df.groupby(feature).agg(count(label), mean(label)).orderBy(feature)
    aggregated.show()
    if plot == True:
        pdf = aggregated.toPandas()
        pdf.plot.bar(x=pdf.columns[0], y=pdf.columns[2], capsize=5)
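# A hedged example call for the helper above, reusing columns created in this
# snippet; plot=False avoids the pandas plotting dependency.
explore(labeled, "star_rating", "high_rating", plot=False)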
importance["features"] = features_col importance = importance.sort_values(by='values', ascending=False) importance = importance.reset_index(drop=True) print('1st feature', importance.features[0], importance.values[0][0]) print('2nd feature', importance.features[1], importance.values[1][0]) print('3rd feature', importance.features[2], importance.values[2][0]) # train DTR dtr_total = DecisionTreeRegressor(featuresCol='features', labelCol='label', predictionCol="bin_prediction", maxDepth=15, maxBins=16, seed=111) r_model = dtr_total.fit(train_total) predictions = r_model.transform(test_total) binarizer = Binarizer(threshold=0.5, inputCol="bin_prediction", outputCol="prediction") binarizedDataFrame = binarizer.transform(predictions) accuracy = evaluator.evaluate(binarizedDataFrame) print("\n") print("Final DTR Accuracy = %g " % accuracy) print('AUC:', BinaryClassificationMetrics(binarizedDataFrame['label','prediction'].rdd).areaUnderROC) importance = pd.DataFrame(model.featureImportances.toArray(), columns=["values"]) features_col = pd.Series(feature_names) importance["features"] = features_col importance = importance.sort_values(by='values', ascending=False) importance = importance.reset_index(drop=True) print('1st feature', importance.features[0], importance.values[0][0]) print('2nd feature', importance.features[1], importance.values[1][0]) print('3rd feature', importance.features[2], importance.values[2][0]) # train lr
.appName("SQL-ML") \ .getOrCreate() """ Spark机器学习之特征提取、选择、转换 : https://blog.csdn.net/cheng9981/article/details/63280665/ 处理特征的算法,大致分为以下几组: 1、提取:从“原始”数据提取特征 2、转换:缩放,转换或修改要素 3、选择:从一组较大的要素中选择一个子集 4、局部敏感哈希(LSH):这类算法将特征变换的方面与其他算法相结合。 """ df2 = spark.createDataFrame([(0.5, 0.3), (0.5, 0.7)], ["values1", "values2"]) binarizer2 = Binarizer(thresholds=[0.0, 1.0]) binarizer2.setInputCols(["values1", "values2"]).setOutputCols(["output1", "output2"]) binarizer2.transform(df2).show() from pyspark.ml.feature import Word2Vec # Input data: Each row is a bag of words from a sentence or document. documentDF = spark.createDataFrame( [("Hi I heard about Spark".split(" "), ), ("I wish Java could use case classes".split(" "), ), ("Logistic regression models are neat".split(" "), )], ["text"]) # Learn a mapping from words to Vectors. word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result") model = word2Vec.fit(documentDF)
# MAGIC Divide the hosts by whether their `review_scores_rating` is above 97. Do this using the transformer `Binarizer` with the output column `high_rating`. This should create the objects `binarizer` and the transformed DataFrame `transformedBinnedDF`.
# MAGIC
# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/> **Hint:** Note that `Binarizer` is a transformer, so it does not have a `.fit()` method<br>
# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/> **Hint:** See the <a href="http://spark.apache.org/docs/latest/api/python/pyspark.ml.html?highlight=binarizer#pyspark.ml.feature.Binarizer" target="_blank">Binarizer Docs</a> for more details.

# COMMAND ----------

from pyspark.ml.feature import Binarizer

# COMMAND ----------

# TODO
# The task above says "above 97", so the threshold is 97.0 (not 0.97)
binarizer = Binarizer(threshold=97.0, inputCol="review_scores_rating", outputCol="high_rating")
transformedBinnedDF = binarizer.transform(airbnbDF)

display(transformedBinnedDF)

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml.feature import Binarizer

dbTest("ML1-P-05-01-01", True, type(binarizer) == type(Binarizer()))
dbTest("ML1-P-05-01-02", True, binarizer.getInputCol() == 'review_scores_rating')
dbTest("ML1-P-05-01-03", True, binarizer.getOutputCol() == 'high_rating')
dbTest("ML1-P-05-01-04", True, "high_rating" in transformedBinnedDF.columns)

print("Tests passed!")
ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)

# COMMAND ----------

### Binarizer takes numerical inputs and converts them into a binary output (0 or 1) with respect to the provided threshold
from pyspark.ml.feature import Binarizer

continuousDataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                            ["id", "feature"])
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
binarizedDataFrame = binarizer.transform(continuousDataFrame)
print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

# COMMAND ----------

### PCA is a statistical procedure used to reduce a vector's dimensionality. This example reduces a 5-dimensional feature to a 3-dimensional pca feature
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = spark.createDataFrame(data, ["features"])
zfill_cols = piv_df.columns

# Zero fill the pivoted values
# Note: list.remove() returns None, so it cannot be passed inline to fillna()
zfill_cols.remove('NO')
df = df.fillna(0, subset=zfill_cols)

Binarizing Day of Week

In a previous video, we saw that it was very unlikely for a home to list on the weekend. Let's create a new field that says whether the house is listed for sale on a weekday or not. In this example there is a field called List_Day_of_Week in which Monday is labeled 1.0 and Sunday is 7.0. Let's convert this to a binary field with weekday being 0 and weekend being 1. We can use the pyspark feature transformer Binarizer to do this.

# Import transformer
from pyspark.ml.feature import Binarizer

# Create the transformer
binarizer = Binarizer(threshold=5.0, inputCol='List_Day_of_Week', outputCol='Listed_On_Weekend')

# Apply the transformation to df
df = binarizer.transform(df)

# Verify transformation
df[['List_Day_of_Week', 'Listed_On_Weekend']].show()

Bucketing

If you are a homeowner, it's very important whether a house has 1, 2, 3 or 4 bedrooms. But like bathrooms, once you hit a certain point you don't really care whether the house has 7 or 8. In this example we'll look at how to figure out some good value points for bucketing; applying the Bucketizer itself is sketched after this example.

from pyspark.ml.feature import Bucketizer

# Plot distribution of sample_df
sns.distplot(sample_df, axlabel='BEDROOMS')
plt.show()

# Create the bucket splits and bucketizer
splits = [0, 1, 2, 3, 4, 5, float('Inf')]
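The original snippet stops at the splits. A minimal continuation sketch, assuming the bedroom counts live in a column named 'BEDROOMS' (the input and output column names here are illustrative):

# Create the Bucketizer from the splits above ('BEDROOMS' and
# 'bedrooms_bucket' are assumed, illustrative column names)
buck = Bucketizer(splits=splits, inputCol='BEDROOMS', outputCol='bedrooms_bucket')

# Apply the bucketing transformation to df
df = buck.transform(df)

# Verify the bucketed column
df[['BEDROOMS', 'bedrooms_bucket']].show()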
dtr = DecisionTreeRegressor(labelCol="label", featuresCol="features")
pipeline_dtr = Pipeline(stages=[dtr])
paramGrid_reg = ParamGridBuilder() \
    .addGrid(dtr.maxDepth, [5, 10, 30]) \
    .addGrid(dtr.maxBins, [20, 35, 40]) \
    .build()
crossval_reg = CrossValidator(estimator=pipeline_dtr,
                              estimatorParamMaps=paramGrid_reg,
                              evaluator=evaluator_reg,
                              numFolds=2)
cvModel_reg = crossval_reg.fit(trainingData)
prediction_reg = cvModel_reg.transform(testData)
binarizer = Binarizer(threshold=0.5, inputCol="prediction",
                      outputCol="binarized_prediction")
binarizedDataFrame = binarizer.transform(prediction_reg)
binarized = binarizedDataFrame.drop('prediction')
bdf = binarized.withColumnRenamed('binarized_prediction', 'prediction')
bestModel_reg = cvModel_reg.bestModel
bestnewModel_reg = bestModel_reg.stages[0]
bestParams_reg = bestnewModel_reg.extractParamMap()
print("\n")
print("The best parameters for Decision Tree Regressor are...")
print("\n")
for x in bestParams_reg:
    print(x.name, bestParams_reg[x])
print("\n")
print("Printing the selected training parameters... ")
print("\n")
maxDepth_dtr = bestnewModel_reg._java_obj.getMaxDepth()
print("Best maxDepth = ", maxDepth_dtr)
    StructField('TaxiOut', DoubleType(), True),
    StructField('Cancelled', IntegerType(), True),
    StructField('CancellationCode', StringType(), True),
    StructField('Diverted', IntegerType(), True),
    StructField('CarrierDelay', DoubleType(), True),
    StructField('WeatherDelay', DoubleType(), True),
    StructField('NASDelay', DoubleType(), True),
    StructField('SecurityDelay', DoubleType(), True),
    StructField('LateAircraftDelay', DoubleType(), True)
])

air = spark.read.options(header='true').schema(schema_sdf).csv(
    "/home/devel/2020210973chenxiao/airdelay_small.csv")
air1 = air.select(["ArrDelay", "Year", "DayofMonth", "DayofWeek", "DepTime",
                   "CRSDepTime", "CRSArrTime", "UniqueCarrier",
                   "ActualElapsedTime", "Origin", "Dest", "Distance"])
air3 = air1.na.drop()

# Label a flight as delayed (1.0) when ArrDelay > 0
binarizer = Binarizer(threshold=0, inputCol="ArrDelay", outputCol="Delay_feature")
air2 = binarizer.transform(air3)

df = get_sdummies(air2, ["UniqueCarrier", "Origin", "Dest"], [0.8, 0.5, 0.6])
df1 = df.select(["Delay_feature", "Year", "DayofMonth", "DayofWeek", "DepTime",
                 "CRSDepTime", "CRSArrTime", "ONEHOT_UniqueCarrier",
                 "ActualElapsedTime", "ONEHOT_Origin", "ONEHOT_Dest", "Distance"])
assembler = VectorAssembler(
    inputCols=["Year", "DayofMonth", "DayofWeek", "DepTime", "CRSDepTime",
               "CRSArrTime", "ONEHOT_UniqueCarrier", "ActualElapsedTime",
               "ONEHOT_Origin", "ONEHOT_Dest", "Distance"],
    outputCol="features")
df2 = assembler.transform(df1)
df2 = df2.withColumnRenamed("Delay_feature", "label")
df3 = df2.select(["label", "features"])
#df3.show()

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
model1 = lr.fit(df3)

# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.