Example No. 1
 def test_preserve_set_state(self):
     dataset = self.spark.createDataFrame([(0.5,)], ["data"])
     binarizer = Binarizer(inputCol="data")
     self.assertFalse(binarizer.isSet("threshold"))
     binarizer.transform(dataset)
     binarizer._transfer_params_from_java()
     self.assertFalse(binarizer.isSet("threshold"),
                      "Params not explicitly set should remain unset after transform")
Example No. 2
 def test_preserve_set_state(self):
     dataset = self.spark.createDataFrame([(0.5,)], ["data"])
     binarizer = Binarizer(inputCol="data")
     self.assertFalse(binarizer.isSet("threshold"))
     binarizer.transform(dataset)
     binarizer._transfer_params_from_java()
     self.assertFalse(binarizer.isSet("threshold"),
                      "Params not explicitly set should remain unset after transform")
Example No. 3
 def naiveOutliers(self, df, c):
     # One Binarizer per direction/threshold: flag values above 2 and 3 standard deviations
     binarizer_2sdu = Binarizer(threshold=2.0, inputCol=c, outputCol="2SDU_" + c)
     binarizer_3sdu = Binarizer(threshold=3.0, inputCol=c, outputCol="3SDU_" + c)
     binarizer_2sdd = Binarizer(threshold=2.0, inputCol=c, outputCol="2SDD_" + c)
     binarizer_3sdd = Binarizer(threshold=3.0, inputCol=c, outputCol="3SDD_" + c)

     df = binarizer_2sdu.transform(df.select('snapshotDate', 'ID', c))
     df = binarizer_3sdu.transform(df)
     # Negate the column so the same thresholds flag downward outliers
     df = df.withColumn(c, -1.0 * df[c])
     df = binarizer_2sdd.transform(df)
     df = binarizer_3sdd.transform(df)

     return df.select('snapshotDate', 'ID', '2SDU_' + c, '3SDU_' + c, '2SDD_' + c, '3SDD_' + c)
Example No. 4
def performance(prediction):
    '''
    Evaluate model performance: binarize predictions and labels at 0.5 and
    return the area under the ROC curve.
    '''
    binarizer = Binarizer(threshold=0.5,
                          inputCol="prediction",
                          outputCol="b_prediction")
    binarizedDataFrame = binarizer.transform(prediction)
    binarizer = Binarizer(threshold=0.5, inputCol="label", outputCol="b_label")
    binarizedDataFrame = binarizer.transform(binarizedDataFrame)
    prediction_label = binarizedDataFrame.select('b_prediction', 'b_label')
    metrics = BinaryClassificationMetrics(prediction_label.rdd)
    return metrics.areaUnderROC
Example No. 5
def best_model(algo, bin, log):
    mdl = algo.fit(trainingData)
    pred = mdl.transform(testData)
    if bin:
        bina = Binarizer(threshold=0.5,
                         inputCol="prediction_c",
                         outputCol="prediction")
        pred = bina.transform(pred)
    acc = evaluator_multi.evaluate(pred)
    area_under_curve = evaluator_bin.evaluate(pred)
    print("Accuracy:", acc)
    print("Area Under ROC:", area_under_curve)
    print("Top Three Features")
    if log:
        feature_importance = mdl.coefficients.values
        for f in np.abs(feature_importance).argsort()[-3:][::-1]:
            print(schemaNames[f + 1], end=" ")
        print("")

    else:
        feature_importance = mdl.featureImportances

        top_features = np.zeros(ncolumns - 1)
        top_features[feature_importance.indices] = feature_importance.values
        for f in top_features.argsort()[-3:][::-1]:
            print(schemaNames[f + 1], end=" ")
        print("")

    return mdl
Example No. 6
    def test_model_binarizer(self):
        import numpy
        data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                          ["id", "feature"])
        model = Binarizer(inputCol='feature', outputCol='binarized')

        # the input name should match Binarizer.inputCol
        model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                     [('feature', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("binarized").toPandas().values.astype(
            numpy.float32)
        data_np = data.select('feature').toPandas().values.astype(
            numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBinarizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['binarized'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example No. 7
def findmodel(algo, bin, log):
    model = algo.fit(trainingData)
    predictions = model.transform(testData)
    if bin:
        binarizer = Binarizer(threshold=0.5,
                              inputCol="prediction_c",
                              outputCol="prediction")
        predictions = binarizer.transform(predictions)
    accuracy = evaluatorM.evaluate(predictions)
    auc = evaluatorB.evaluate(predictions)
    print("Accuracy:", accuracy)
    print("Area Under ROC:", auc)
    print("Top Features")
    if log:
        fi = model.coefficients.values
        for i in np.abs(fi).argsort()[-3:][::-1]:
            print(schemaNames[i + 1], end=" ")
        print("")

    else:
        fi = model.featureImportances

        imp_feat = np.zeros(ncolumns - 1)
        imp_feat[fi.indices] = fi.values
        for i in imp_feat.argsort()[-3:][::-1]:
            print(schemaNames[i + 1], end=" ")
        print("")

    return model
Example No. 8
def binarization_by_threshold(dataFrame, threshold, inputCol):
    # binarize continuous values according to the threshold
    binarizer = Binarizer(threshold=threshold,
                          inputCol=inputCol,
                          outputCol='%s_binarized' % (inputCol))
    binarizedDataFrame = binarizer.transform(dataFrame)
    print('Binarizer output with Threshold = %f' % binarizer.getThreshold())
    return binarizedDataFrame
Example No. 9
 def test_default_params_transferred(self):
     dataset = self.spark.createDataFrame([(0.5,)], ["data"])
     binarizer = Binarizer(inputCol="data")
     # intentionally change the pyspark default, but don't set it
     binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
     result = binarizer.transform(dataset).select("my_default").collect()
     self.assertFalse(binarizer.isSet(binarizer.outputCol))
     self.assertEqual(result[0][0], 1.0)
Example No. 10
 def test_default_params_transferred(self):
     dataset = self.spark.createDataFrame([(0.5, )], ["data"])
     binarizer = Binarizer(inputCol="data")
     # intentionally change the pyspark default, but don't set it
     binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
     result = binarizer.transform(dataset).select("my_default").collect()
     self.assertFalse(binarizer.isSet(binarizer.outputCol))
     self.assertEqual(result[0][0], 1.0)
Example No. 11
def pre_processing(continuousDataFrame):
    binarizer = Binarizer(threshold=0.5,
                          inputCol="feature",
                          outputCol="binarized_feature")

    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
Example No. 12
 def binarizer(self, df, column):
     """
     按指定阈值 二值化Binarizer
     """
     # 对连续值根据阈值threshold二值化
     binarizer = Binarizer(threshold=5.1,
                           inputCol=column,
                           outputCol=column + '_binarized_feature')
     binarizedDataFrame = binarizer.transform(df)
     print('Binarizer output with Threshold = %f' %
           binarizer.getThreshold())
     return binarizedDataFrame
Example No. 13
def decisionTreeRegressor(data, ncolumns, schemaNames):
    from pyspark.ml import Pipeline
    from pyspark.ml.regression import DecisionTreeRegressor
    from pyspark.ml.tuning import ParamGridBuilder
    from pyspark.ml.feature import StringIndexer, VectorIndexer
    from pyspark.ml.tuning import CrossValidator
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import numpy as np
    import time

    binarizer = Binarizer(
        threshold=0.00001,
        inputCol="features",
        outputCol="binarized_features",
    )
    binarizedDataFrame = binarizer.transform(data)

    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)
    dtr = DecisionTreeRegressor(labelCol="label",
                                featuresCol="binarized_features",
                                maxDepth=10,
                                maxBins=10,
                                impurity='variance')

    timer = ''
    start = time.time()
    cvModel = dtr.fit(trainingData)
    end = time.time()
    timer = ((end - start) / 60)

    prediction = cvModel.transform(testData)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    fi = cvModel.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    x = np.arange(ncolumns - 1)
    idx = (-imp_feat).argsort()[:3]
    feat = []
    for i in idx:
        feat.append(schemaNames[i])

    return feat, rmse, areaUC, timer
Example No. 14
def prep_data(sqlContext, data, drops):
    """Prepares date for ML. Preparation includes: making a label column (by the rule: naacess > 10),
	applying drops and transforming data into LabeledPoint"""

    binarizer = Binarizer(threshold=10.0, inputCol="naccess", outputCol="target")
    data = binarizer.transform(data)

    drops = drops.split(",")
    cols = [x for x in data.columns if x not in set(drops)]

    data = data.select(cols)

    labeled = label_data(data)
    preped_data = sqlContext.createDataFrame(labeled, ['features','label'])

    return preped_data
Example No. 15
def prep_data(sqlContext, data, drops):
    """Prepares date for ML. Preparation includes: making a label column (by the rule: naacess > 10),
	applying drops and transforming data into LabeledPoint"""

    binarizer = Binarizer(threshold=10.0,
                          inputCol="naccess",
                          outputCol="target")
    data = binarizer.transform(data)

    drops = drops.split(",")
    cols = [x for x in data.columns if x not in set(drops)]

    data = data.select(cols)

    labeled = label_data(data)
    preped_data = sqlContext.createDataFrame(labeled, ['features', 'label'])

    return preped_data
Example No. 16
from pyspark.ml.feature import Binarizer
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
continuousDF = sqlContext.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.8)],
                                          ['label', 'features'])
binarizer = Binarizer(inputCol="features",
                      outputCol="binarized_feature",
                      threshold=0.5)
binarizedDF = binarizer.transform(continuousDF)
for bf in binarizedDF.select("binarized_feature").take(3):
    print(bf)
"""OUTPUT 
Row(binarized_feature=0.0)
Row(binarized_feature=1.0)
Row(binarized_feature=1.0)
"""
Example No. 17
print("Evaluating Oversampling dataset:")

lrModel = lr.fit(o_train)
prediction = lrModel.transform(o_test)
rmse = eval.evaluate(prediction)
print("RMSE: %.3f" % rmse)
r2 = eval.evaluate(prediction, {eval.metricName: "r2"})
print("r2: %.3f" % r2)

## Random Forest

## oversampling

print("Evaluating oversampling dataset Random Forest:")
binarizer = Binarizer(threshold=0, inputCol="label", outputCol="bin_labels")
train_bin = binarizer.transform(o_train)
test_bin = binarizer.transform(o_test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
dt = RandomForestClassifier(labelCol="bin_labels",
                            featuresCol="pcaFeatures",
                            numTrees=10,
                            maxDepth=30)
dtModel = dt.fit(train_bin)
predictions = dtModel.transform(test_bin)
accuracy = evaluator.evaluate(predictions)
print("LR Accuracy = %g " % accuracy)
print(
    'AUC:',
    BinaryClassificationMetrics(predictions['label',
                                            'prediction'].rdd).areaUnderROC)
Example No. 18
paramGrid_regressor = ParamGridBuilder() \
    .addGrid(dt1.maxDepth, [5, 10, 30]) \
    .addGrid(dt1.maxBins, [20, 35, 40]) \
    .build()
crossval_regressor = CrossValidator(estimator=pipeline_regressor,
                                    estimatorParamMaps=paramGrid_regressor,
                                    evaluator=evaluator_regressor,
                                    numFolds=2,
                                    parallelism=3)

cvModel_regressor = crossval_regressor.fit(trainingData)
prediction_regressor = cvModel_regressor.transform(testData)
binarizer = Binarizer(threshold=0.5,
                      inputCol="prediction",
                      outputCol="binarized_prediction")
binarizedDataFrame = binarizer.transform(prediction_regressor)
P1 = binarizedDataFrame.drop('prediction')
binarizedDataFrame_1 = P1.withColumnRenamed("binarized_prediction",
                                            "prediction")
evaluator_regressor1 = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy_regressor = evaluator_regressor1.evaluate(binarizedDataFrame_1)
print("Accuracy of DecisionTreeRegressor= %g " % accuracy_regressor)

evaluator_regressor_area_under_curve = BinaryClassificationEvaluator(
    rawPredictionCol="prediction")
accuracy_regressor_area_under_curve = evaluator_regressor_area_under_curve.evaluate(
    prediction_regressor)
print("Area Under the curve on Decision Trees for Regression ",
      accuracy_regressor_area_under_curve)
print("Printing Parameter for DecisionTreeRegressor")
Example No. 19
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BinarizerExample")\
        .getOrCreate()

    # $example on$
    continuousDataFrame = spark.createDataFrame([
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ], ["id", "feature"])

    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
    # $example off$

    spark.stop()
Example No. 20
                            handleInvalid="skip")
qty_df = qty_indexer.fit(ohe_df).transform(ohe_df)
qty_df.select("quantity", "quantity_indexed").display()

# COMMAND ----------

# MAGIC %md Binarization of continuous numerical data.

# COMMAND ----------

from pyspark.ml.feature import Binarizer

binarizer = Binarizer(threshold=10,
                      inputCol="unit_price",
                      outputCol="binarized_price")
binarized_df = binarizer.transform(qty_df)
binarized_df.select("quantity", "binarized_price").display()

# COMMAND ----------

# MAGIC %md Transforming date/time columns

# COMMAND ----------

from pyspark.sql.functions import month

month_df = binarized_df.withColumn("invoice_month", month("invoice_time"))
month_indexer = StringIndexer(inputCol="invoice_month",
                              outputCol="month_indexed",
                              handleInvalid="skip")
month_df = month_indexer.fit(month_df).transform(month_df)
Example No. 21
# pyspark.ml.feature module

#
from pyspark.ml.feature import Binarizer
df = sparksession.createDataFrame([(0.5,)], ["values"])
df.collect()
binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
df2 = binarizer.transform(df)
df2.dtypes
df.collect()
df2.collect()
binarizer.getOutputCol()

rawData.take(1)
binarizer2 = Binarizer(threshold=0.5, inputCol="srv_diff_host_rate", outputCol="features")
binarizer2.transform(rawData)

binarizer.explainParam('inputCol')
binarizer.inputCol
binarizer.params

rawData.select(['count']).show()


rawData.dtypes
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="y_label", outputCol='indexed_y_label')
model = stringIndexer.fit(rawData)
td = model.transform(rawData)
td.dtypes
Example No. 22
df = df.na.drop()

# In[9]:

#df.count(),len(df.columns)

# Creating a categorical variable: let's create a categorical variable to denote whether the humidity is not low. If the value is less than 25%, we want the categorical value to be 0; otherwise the categorical value should be 1. We can create this categorical variable as a column in a DataFrame using Binarizer.

# In[10]:

# binarizer = Binarizer(threshold=24.99999,inputCol="relative_humidity_3pm",outputCol="label")

binarizer = Binarizer(threshold=24.99999,
                      inputCol=target_col,
                      outputCol="label")
binarizedDF = binarizer.transform(df)

# In[11]:

#binarizedDF.describe()

# # Creating target variable named label

# The threshold argument specifies the threshold value for the variable, inputCol is the input column to read, and outputCol is the name of the new categorical column. The second line applies the Binarizer and creates a new DataFrame with the categorical column. We can look at the first four values in the new DataFrame:

# In[12]:

#binarizedDF.select("relative_humidity_3pm","label").show(4)

# The first row's humidity value is greater than 25% and the label is 1. The other humidity values are less than 25% and have labels equal to 0.
Example No. 23
from pyspark.ml.feature import Binarizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("binarizer").master(
        "local").getOrCreate()

    dataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.3)],
                                      ["id", "feature"])

    binarizer = Binarizer(inputCol="feature",
                          outputCol="binarizer",
                          threshold=0.5)

    binarizerDataFrame = binarizer.transform(dataFrame)

    binarizerDataFrame.show()

    spark.stop()
Example No. 24
def binarize_training_data(trainingRawData):
    binarizer = Binarizer() \
        .setInputCol("rating") \
        .setOutputCol("label") \
        .setThreshold(3.5)
    return binarizer.transform(trainingRawData)
Example No. 25
# Useful transformation functions (a minimal MinMaxScaler sketch follows this list):
# Binarizer: converts continuous variables to 1 / 0 depending on a set threshold
# Bucketizer: similar, but buckets values into multiple ranges rather than two
# MaxAbsScaler: rescale data to the -1 to 1 range
# MinMaxScaler: rescale data to the 0 to 1 range
# OneHotEncoder: encodes a categorical column as binary vectors
# PCA: self explanatory
# StandardScaler: convert so mean = 0 and sd = 1
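# A minimal sketch (not part of the original snippet) of the MinMaxScaler listed above,
# assuming an active SparkSession named `spark`; the tiny DataFrame and column names
# are illustrative only.
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

scale_df = spark.createDataFrame([(Vectors.dense([10.0, 0.5]),),
                                  (Vectors.dense([30.0, 1.5]),)], ["features"])
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
# MinMaxScaler is an Estimator: fit() learns per-dimension min/max, transform() rescales to [0, 1]
scaler.fit(scale_df).transform(scale_df).show(truncate=False)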

from pyspark.ml.feature import Binarizer

binarizer = Binarizer(threshold=500,
                      inputCol="Yearly Amount Spent",
                      outputCol="label")
binarizedDataFrame = binarizer.transform(final_data)
binarizedDataFrame = binarizedDataFrame.drop("Yearly Amount Spent")
binarizedDataFrame.show()

from pyspark.ml.classification import LogisticRegression

logReg = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
fitted_logReg = logReg.fit(binarizedDataFrame)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(fitted_logReg.coefficients))
print("Intercept: " + str(fitted_logReg.intercept))
#log_summary = fitted_logReg.summary()

# evaluate: evaluate() expects a DataFrame (ideally a held-out test split,
# but this snippet only has the training frame available)
predictions_and_labels = fitted_logReg.evaluate(binarizedDataFrame)

Example No. 26
# ## Generate label

# We can treat `star_rating` as a continuous numerical label or an ordered
# categorical label:
filtered.groupBy("star_rating").count().orderBy("star_rating").show()

# Rather than try to predict each value, let us see if we can distinguish
# between five-star and non-five-star ratings.  We can use the
# [Binarizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
# to create our binary label:
from pyspark.ml.feature import Binarizer
converted = filtered.withColumn("star_rating", col("star_rating").cast("double"))
binarizer = Binarizer(inputCol="star_rating", outputCol="high_rating", threshold = 4.5)
labeled = binarizer.transform(converted)
labeled.crosstab("star_rating", "high_rating").show()

# **Note:** `Binarizer` does not like integer values, thus we had to convert to doubles.


# ## Extract, transform, and select features

# Create function to explore features:
def explore(df, feature, label, plot=True):
  from pyspark.sql.functions import count, mean
  aggregated = df.groupby(feature).agg(count(label), mean(label)).orderBy(feature)
  aggregated.show()
  if plot == True:
    pdf = aggregated.toPandas()
    pdf.plot.bar(x=pdf.columns[0], y=pdf.columns[2], capsize=5)
Example No. 27
importance["features"] = features_col
importance = importance.sort_values(by='values', ascending=False)
importance = importance.reset_index(drop=True)
print('1st feature', importance.features[0], importance.values[0][0])
print('2nd feature', importance.features[1], importance.values[1][0])
print('3rd feature', importance.features[2], importance.values[2][0])


# train DTR

dtr_total = DecisionTreeRegressor(featuresCol='features', labelCol='label', predictionCol="bin_prediction", maxDepth=15, maxBins=16, seed=111)
r_model = dtr_total.fit(train_total)
predictions = r_model.transform(test_total)

binarizer = Binarizer(threshold=0.5, inputCol="bin_prediction", outputCol="prediction")
binarizedDataFrame = binarizer.transform(predictions)

accuracy = evaluator.evaluate(binarizedDataFrame)
print("\n")
print("Final DTR Accuracy = %g " % accuracy)
print('AUC:', BinaryClassificationMetrics(binarizedDataFrame['label','prediction'].rdd).areaUnderROC)
importance = pd.DataFrame(r_model.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(feature_names)
importance["features"] = features_col
importance = importance.sort_values(by='values', ascending=False)
importance = importance.reset_index(drop=True)
print('1st feature', importance.features[0], importance.values[0][0])
print('2nd feature', importance.features[1], importance.values[1][0])
print('3rd feature', importance.features[2], importance.values[2][0])

# train lr
Example No. 28
    .appName("SQL-ML") \
    .getOrCreate()
"""
Spark机器学习之特征提取、选择、转换 : https://blog.csdn.net/cheng9981/article/details/63280665/
处理特征的算法,大致分为以下几组:
     1、提取:从“原始”数据提取特征
     2、转换:缩放,转换或修改要素
     3、选择:从一组较大的要素中选择一个子集
     4、局部敏感哈希(LSH):这类算法将特征变换的方面与其他算法相结合。
"""

df2 = spark.createDataFrame([(0.5, 0.3), (0.5, 0.7)], ["values1", "values2"])
binarizer2 = Binarizer(thresholds=[0.0, 1.0])
binarizer2.setInputCols(["values1",
                         "values2"]).setOutputCols(["output1", "output2"])
binarizer2.transform(df2).show()

from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame(
    [("Hi I heard about Spark".split(" "), ),
     ("I wish Java could use case classes".split(" "), ),
     ("Logistic regression models are neat".split(" "), )], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3,
                    minCount=0,
                    inputCol="text",
                    outputCol="result")
model = word2Vec.fit(documentDF)
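# A possible continuation (not in the original snippet): apply the fitted model so each
# document is mapped to the average of its word vectors.
result = model.transform(documentDF)
result.select("result").show(truncate=False)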
Example No. 29
# MAGIC Divide the hosts by whether their `review_scores_rating` is above 97.  Do this using the transformer `Binarizer` with the output column `high_rating`.  This should create the objects `binarizer` and the transformed DataFrame `transformedBinnedDF`.
# MAGIC
# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/>&nbsp;**Hint:** Note that `Binarizer` is a transformer, so it does not have a `.fit()` method<br>
# MAGIC <img alt="Hint" title="Hint" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.3em" src="https://files.training.databricks.com/static/images/icon-light-bulb.svg"/>&nbsp;**Hint:** See the <a href="http://spark.apache.org/docs/latest/api/python/pyspark.ml.html?highlight=binarizer#pyspark.ml.feature.Binarizer" target="_blank">Binarizer Docs</a> for more details.</a>

# COMMAND ----------

from pyspark.ml.feature import Binarizer

# COMMAND ----------

# TODO
binarizer = Binarizer(threshold=97.0,
                      inputCol="review_scores_rating",
                      outputCol="high_rating")
transformedBinnedDF = binarizer.transform(airbnbDF)

display(transformedBinnedDF)

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml.feature import Binarizer

dbTest("ML1-P-05-01-01", True, type(binarizer) == type(Binarizer()))
dbTest("ML1-P-05-01-02", True,
       binarizer.getInputCol() == 'review_scores_rating')
dbTest("ML1-P-05-01-03", True, binarizer.getOutputCol() == 'high_rating')
dbTest("ML1-P-05-01-04", True, "high_rating" in transformedBinnedDF.columns)

print("Tests passed!")
Example No. 30
ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)

# COMMAND ----------

### Binarizer takes numerical inputs and converts them into binary output (0 and 1) with respect to the provided threshold
from pyspark.ml.feature import Binarizer

continuousDataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                            ["id", "feature"])

binarizer = Binarizer(threshold=0.5,
                      inputCol="feature",
                      outputCol="binarized_feature")

binarizedDataFrame = binarizer.transform(continuousDataFrame)

print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

# COMMAND ----------

### PCA is a statistical procedure used to reduce a vector's dimensionality. This example reduces a 5-dimensional feature vector to a 3-dimensional PCA feature
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
df = spark.createDataFrame(data, ["features"])
Example No. 31
zfill_cols = piv_df.columns

# Zero fill the pivoted values (excluding the 'NO' column)
zfill_cols.remove('NO')  # list.remove() mutates in place and returns None
df = df.fillna(0, subset=zfill_cols)

Binarizing Day of Week
In a previous video, we saw that it was very unlikely for a home to be listed on the weekend. Let's create a new field that says whether the house is listed for sale on a weekday or not. In this example there is a field called List_Day_of_Week in which Monday is labeled 1.0 and Sunday 7.0. Let's convert this to a binary field with weekday being 0 and weekend being 1. We can use the pyspark feature transformer Binarizer to do this.

# Import transformer
from pyspark.ml.feature import Binarizer

# Create the transformer
binarizer = Binarizer(threshold=5.0, inputCol='List_Day_of_Week', outputCol='Listed_On_Weekend')

# Apply the transformation to df
df = binarizer.transform(df)

# Verify transformation
df[['List_Day_of_Week', 'Listed_On_Weekend']].show()

Bucketing
If you are a homeowner, it's very important whether a house has 1, 2, 3 or 4 bedrooms. But, like bathrooms, once you hit a certain point you don't really care whether the house has 7 or 8. In this example we'll look at how to figure out some good value points at which to bucket.

from pyspark.ml.feature import Bucketizer

# Plot distribution of sample_df
sns.distplot(sample_df, axlabel='BEDROOMS')
plt.show()

# Create the bucket splits and bucketizer
splits = [ 0, 1, 2, 3, 4, 5, float('Inf')]
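# A possible completion (not in the original snippet): feed the splits into a Bucketizer.
# The column names ('BEDROOMS', 'bedrooms_binned') are assumptions based on the plot label.
bucketizer = Bucketizer(splits=splits, inputCol='BEDROOMS', outputCol='bedrooms_binned')
df = bucketizer.transform(df)
df[['BEDROOMS', 'bedrooms_binned']].show()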
Example No. 32
dtr = DecisionTreeRegressor(labelCol="label", featuresCol="features")
pipeline_dtr = Pipeline(stages=[dtr])
paramGrid_reg = ParamGridBuilder() \
    .addGrid(dtr.maxDepth, [5, 10, 30]) \
    .addGrid(dtr.maxBins, [20, 35, 40]) \
    .build()
crossval_reg = CrossValidator(estimator=pipeline_dtr,
                              estimatorParamMaps=paramGrid_reg,
                              evaluator=evaluator_reg,
                              numFolds=2)
cvModel_reg = crossval_reg.fit(trainingData)
prediction_reg = cvModel_reg.transform(testData)
binarizer = Binarizer(threshold=0.5,
                      inputCol="prediction",
                      outputCol="binarized_prediction")
binarizedDataFrame = binarizer.transform(prediction_reg)
binarized = binarizedDataFrame.drop('prediction')
bdf = binarized.withColumnRenamed('binarized_prediction', 'prediction')
bestModel_reg = cvModel_reg.bestModel
bestnewModel_reg = bestModel_reg.stages[0]
bestParams_reg = bestnewModel_reg.extractParamMap()
print("\n")
print("The best parameters for Decision Tree Regressor are...")
print("\n")
for x in bestParams_reg:
    print(x.name, bestParams_reg[x])
print("\n")
print("Printing the selected training parameters... ")
print("\n")
maxDepth_dtr = bestnewModel_reg._java_obj.getMaxDepth()
print("Best maxDepth = ", maxDepth_dtr)
Example No. 33
        StructField('TaxiOut',  DoubleType(), True),
        StructField('Cancelled',  IntegerType(), True),
        StructField('CancellationCode',  StringType(), True),
        StructField('Diverted',  IntegerType(), True),
        StructField('CarrierDelay', DoubleType(), True),
        StructField('WeatherDelay',  DoubleType(), True),
        StructField('NASDelay',  DoubleType(), True),
        StructField('SecurityDelay',  DoubleType(), True),
        StructField('LateAircraftDelay',  DoubleType(), True)
    ])
air = spark.read.options(header='true').schema(schema_sdf).csv("/home/devel/2020210973chenxiao/airdelay_small.csv")
#air2=air.na.drop()
air1 = air.select(["ArrDelay","Year","DayofMonth","DayofWeek","DepTime","CRSDepTime","CRSArrTime","UniqueCarrier","ActualElapsedTime","Origin","Dest","Distance"])
air3=air1.na.drop()
binarizer = Binarizer(threshold=0, inputCol="ArrDelay", outputCol="Delay_feature")
air2 = binarizer.transform(air3)

df=get_sdummies(air2,["UniqueCarrier","Origin","Dest"],[0.8,0.5,0.6])
df1=df.select(["Delay_feature","Year","DayofMonth","DayofWeek","DepTime","CRSDepTime","CRSArrTime","ONEHOT_UniqueCarrier","ActualElapsedTime","ONEHOT_Origin","ONEHOT_Dest","Distance"])
assembler =VectorAssembler(inputCols=["Year","DayofMonth","DayofWeek","DepTime","CRSDepTime","CRSArrTime","ONEHOT_UniqueCarrier","ActualElapsedTime","ONEHOT_Origin","ONEHOT_Dest","Distance"], outputCol="features")
df2=assembler.transform(df1)
df2=df2.withColumnRenamed("Delay_feature","label")
df3=df2.select(["label","features"])
#df3.show()
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
model1=lr.fit(df3)
# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.