Example #1
    def test_fit_maximize_metric(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder() \
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
            .build()
        tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        bestModel = tvsModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
        validationMetrics = tvsModel.validationMetrics

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
        self.assertEqual(len(grid), len(validationMetrics),
                         "validationMetrics has the same size of grid parameter")
        self.assertEqual(1.0, max(validationMetrics))
Example #2
def main(input_file):
    # Load and parse the data file, converting it to a DataFrame.
    data = MLUtils.loadLabeledPoints(sc, input_file).toDF()

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 10 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=10).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))

    rfModel = model.stages[1]
    print(rfModel)  # summary only
Example #3
    def test_fit_maximize_metric(self):
        sqlContext = SQLContext(self.sc)
        dataset = sqlContext.createDataFrame(
            [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]
        )

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder().addGrid(iee.inducedError, [100.0, 0.0, 10000.0]).build()
        tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        bestModel = tvsModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        self.assertEqual(0.0, bestModel.getOrDefault("inducedError"), "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
Example #4
    def test_fit_minimize_metric(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="rmse")

        grid = (ParamGridBuilder()
                .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
                .build())
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        bestModel = cvModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")
Example #5
    def test_java_params(self):
        """
        This tests a bug fixed by SPARK-18274 which causes multiple copies
        of a Params instance in Python to be linked to the same Java instance.
        """
        evaluator = RegressionEvaluator(metricName="r2")
        df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
        evaluator.evaluate(df)
        self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
        evaluatorCopy = evaluator.copy({evaluator.metricName: "mae"})
        evaluator.evaluate(df)
        evaluatorCopy.evaluate(df)
        self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
        self.assertEqual(evaluatorCopy._java_obj.getMetricName(), "mae")
Example #6
import os

df = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")

df_restaurants = df.filter("category = \"Restaurants\"")


assembler = VectorAssembler(
    inputCols=["average_stars", "cat_avg_review_len", "cat_avg_stars", "cat_business_count", "cat_review_count", "months_yelping", "review_count", "votes_cool", "votes_funny", "votes_useful" ],
    outputCol="features")
output = assembler.transform(df_restaurants)

(trainingData, testData) = output.randomSplit([0.7, 0.3])

dt = DecisionTreeRegressor(labelCol = "elite", featuresCol="features")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

predictions.select("prediction", "elite", "features").show(5)


evaluator = RegressionEvaluator(
    labelCol="elite", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print "Root Mean Squared Error (RMSE) on test data = %g" % rmse




Example #7
#VECTORIZE TRAIN DATA
energi_habis_train = ssc.textFileStream("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
energi_habis_train_labeled_DF = sqlContext.createDataFrame(energi_habis_train_labeled, ["label", "features"])
print(energi_habis_train_labeled_DF)

#VECTORIZE TEST DATA
energi_habis_test = ssc.textFileStream("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = sqlContext.createDataFrame(energi_habis_test_labeled, ["label", "features"])
print(energi_habis_test_labeled_DF)
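# NOTE: parse_train/parse_test are assumed helpers that are not shown here. A minimal,
# hypothetical sketch (assuming Spark 2.x and comma-separated "label,feature1,feature2,..." lines):
# from pyspark.ml.linalg import Vectors
# def parse_train(line):
#     values = [float(x) for x in line.split(',')]
#     return (values[0], Vectors.dense(values[1:]))
# parse_test = parse_train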

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)

#See what the model does
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction","label", "features").show()

#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
Example #8
# Make predictions
predictionsA = modelA.transform(trainingData)
print ('-'*70)
print ('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print ('-'*70)

predictionsB = modelB.transform(trainingData)
print ('-'*70)
print ('MODEL B : ')
predictionsB.select("prediction", "label", "features").show(30)
print ('-'*70)

# Evaluate the model
evaluator = RegressionEvaluator(metricName="rmse")
RMSE = evaluator.evaluate(predictionsA)
print ('-'*70)
print("ModelA: Root Mean Squared Error = " + str(RMSE))
print ('-'*70)
# ModelA: Root Mean Squared Error = 128.602026843

RMSE = evaluator.evaluate(predictionsB)
print ('-'*70)
print("ModelB: Root Mean Squared Error = " + str(RMSE))
print ('-'*70)
# ModelB: Root Mean Squared Error = 129.496300193



Example #9
posTrain, posTest = pos.randomSplit([0.8, 0.2], seed=17)
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrPipeline = Pipeline(stages=[hashingTF, lr])
dt = DecisionTreeRegressor(maxDepth=10, maxBins=50)
dtPipeline = Pipeline(stages=[hashingTF, dt])
rf = RandomForestRegressor(maxDepth=10, maxBins=50, numTrees=50)
rfPipeline = Pipeline(stages=[hashingTF, rf])

posLR = lrPipeline.fit(posTrain)
lrPred = posLR.transform(posTest)
posDT = dtPipeline.fit(posTrain)
dtPred = posDT.transform(posTest)
posRF = rfPipeline.fit(posTrain)
rfPred = posRF.transform(posTest)

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
lr_rmse = evaluator.evaluate(lrPred)
dt_rmse = evaluator.evaluate(dtPred)
rf_rmse = evaluator.evaluate(rfPred)
print("LR RMSE %g, DT RMSE %g, RF RMSE %g" % (lr_rmse, dt_rmse, rf_rmse))

# LR RMSE 0.44829, DT RMSE 0.312846, RF RMSE 0.300322
Example #10
samples = df12.randomSplit([0.7, 0.3])
training = samples[0]
test = samples[1]

lr = LinearRegression(maxIter=5, regParam=0.3, labelCol="weight", featuresCol="features", predictionCol="predic_weight")

model = lr.fit(training)

print("결정계수(R2):%d" % model.summary.r2)

d13 = model.transform(test)
d13.cache()

d13.select("weight", "predic_weight").show(5, False)

evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")

# root mean squared error
rmse = evaluator.evaluate(d13)

# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)

# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)

# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

print("rmse:%d, mse:%d, r2:%d, mae:%d" % (rmse, mse, r2, mae))
Example #11
# MAGIC To start, we'll generate the predictions by using the first model in `petalModels`.

# COMMAND ----------

petalPredictions = petalModels[0].transform(irisPetal)
display(petalPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll evaluate the model using the `RegressionEvaluator`.

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
regEval = RegressionEvaluator().setLabelCol('petalWidth')

print regEval.explainParams()

# COMMAND ----------

# MAGIC %md
# MAGIC The default metric for `RegressionEvaluator` is root mean squared error (RMSE).  Let's view that first.

# COMMAND ----------

print regEval.evaluate(petalPredictions)
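# Other metrics can be requested at evaluation time by passing a param map to the same
# evaluator; for example, R^2 (illustrative usage):
print regEval.evaluate(petalPredictions, {regEval.metricName: 'r2'})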

# COMMAND ----------

# MAGIC %md
Example #12
modelprep1 = va.transform(enriched1).select('userId','movieId','rating','features')

training, testing, other = modelprep1.randomSplit([0.07, 0.03, 0.90])

print '[ INFO ] Training:          ' + str(training.count()) + ' records'
print '[ INFO ] Testing:           ' + str(testing.count()) + ' records'

gb = GBTRegressor(featuresCol="features", labelCol=var_target, predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)

gbmodel = gb.fit(training)
#gbmodel.save('/tmp/spark_models/kaggle_bike_sharing_gb_model')

predictions = gbmodel.transform(testing)

print '[ INFO ] Printing predictions vs label...'
predictions.select('prediction', var_target).show(10, False)

evaluator = RegressionEvaluator(labelCol=var_target, predictionCol="prediction")
print '[ INFO ] Model Fit (RMSE):  ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "rmse"}))
#print '[ INFO ] Model Fit (MSE):   ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "mse"}))
#print '[ INFO ] Model Fit (R2):    ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "r2"}))

total_runtime_seconds = (datetime.datetime.now() - start_time).seconds

print '#'*100
print '[ INFO ] Total Runtime:     ' + str(total_runtime_seconds) + ' seconds'
print '#'*100


#ZEND
Example #13
        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")

    def test_fit_maximize_metric(self):
        sqlContext = SQLContext(self.sc)
        dataset = sqlContext.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = (ParamGridBuilder()
                .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
                .build())
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        bestModel = cvModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")


if __name__ == "__main__":
Example #14
#print("Dispersion: " + str(summary.dispersion))
#print("Null Deviance: " + str(summary.nullDeviance))
#print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
#print("Deviance: " + str(summary.deviance))
#print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
#print("AIC: " + str(summary.aic))
#print("Deviance Residuals: ")
#summary.residuals().show()

# Make predictions.
predictions = glmmodel.transform(testing)

# Select example rows to display.
predictions.select("prediction", "label").show(30,False)

evaluator = RegressionEvaluator(metricName="rmse")  # rmse (default)|mse|r2|mae
RMSE = evaluator.evaluate(predictions)
print 'RMSE: ' + str(RMSE)



#######################################################################################
#
#   Modeling - Gradient Boosting (Regression)
#
#######################################################################################

gbt = GBTRegressor(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
#gbt = GBTClassifier(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)

gbtmodel = gbt.fit(training)
Example #15
def spark_process(sqlContext, sc, validate, path_to_file):

	######################
	#
	# HDFS to DataFrame 
	#
	######################

	
	## all fields:
	#  ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', 
	#   'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude', 
	#   'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount', 
	#   'tolls_amount', 'total_amount']

	# columns to select
	feature_columns = [1,2,3,5,6,9,10]

	# read file and convert to DataFrame
	# dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(path_to_file).cache()
	customSchema = StructType([
    							StructField("vendor_id", StringType(), True),
							    StructField("pickup_datetime", TimestampType(), True),
							    StructField("dropoff_datetime", TimestampType(), True),
							    StructField("passenger_count", StringType(), True),
							    StructField("trip_distance", StringType(), True),
							    StructField("pickup_longitude", DoubleType(), True),
							    StructField("pickup_latitude", DoubleType(), True),
							    StructField("rate_code", StringType(), True),
							    StructField("store_and_fwd_flag", StringType(), True),
							    StructField("dropoff_longitude", DoubleType(), True),
							    StructField("dropoff_latitude", DoubleType(), True),
							    StructField("payment_type", StringType(), True),
							    StructField("fare_amount", StringType(), True),
							    StructField("surcharge", StringType(), True),
							    StructField("mta_tax", StringType(), True),
							    StructField("tip_amount", StringType(), True),
							    StructField("tolls_amount", StringType(), True),
							    StructField("total_amount", StringType(), True)
							    ])

	dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(customSchema).load(path_to_file)
	# create dataframe with selected columns
	dataframe = dataframe.select(*(dataframe.columns[n] for n in feature_columns))
	
	# this number does not include the header
	# number_of_trips = dataframe.count()

	sqlContext.clearCache()
	######################
	#
	# Preprocess data 
	#
	######################

	# filter rows with null fields
	# if passenger count is missing assign it a value of 1
	# filter invalid location: keep only areas near NYC
	dataframe = dataframe.na.drop(how='any',subset=['pickup_datetime','dropoff_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']) \
						.fillna(1,subset=["passenger_count"])     \
						.filter(dataframe.pickup_latitude>40.0)   \
						.filter(dataframe.pickup_latitude<41.0)   \
						.filter(dataframe.pickup_longitude<-73.0) \
						.filter(dataframe.pickup_longitude>-74.0) \
						.filter(dataframe.dropoff_latitude>40.0)  \
						.filter(dataframe.dropoff_latitude<41.0)  \
						.filter(dataframe.dropoff_longitude<-73.0)\
						.filter(dataframe.dropoff_longitude>-74.0)


	######################
	#
	# features engineering
	#
	######################

	# create new column based on time-delta (minutes)
	# convert pickup-datetime column to hour
		
	time_delta_udf = udf(time_delta_minutes,FloatType())

	dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \
						 .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))
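	# NOTE: time_delta_minutes is assumed to be defined elsewhere. A minimal sketch
	# (assumption) of such a helper returning the trip duration in minutes:
	# def time_delta_minutes(pickup, dropoff):
	#     return (dropoff - pickup).total_seconds() / 60.0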

	dataframe = dataframe.select(dataframe.pick_up_hour,
								dataframe.passenger_count.cast("integer"),
								dataframe.pickup_longitude.cast("double"),
								dataframe.pickup_latitude.cast("double"),
								dataframe.dropoff_longitude.cast("double"),
								dataframe.dropoff_latitude.cast("double"),
								dataframe.time_delta.cast("double"))

	dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()


 	# split dataframe into feature and label vector
	# create feature vectors and labels for model training
	feature_assembler = VectorAssembler(inputCols = ['pick_up_hour','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],outputCol = 'features')

	transformed = feature_assembler.transform(dataframe)
	vector_dataframe = transformed.select(col("time_delta").alias("label"),col("features")).cache()

	######################
	#
	# train model
	#
	######################

	if validate:

		################################
		#
		# validate model on 60/40 split
		#
		################################

		# split 
		training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)

		decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
		model = decision_tree_reg.fit(training)

		train_pred = model.transform(training)
		test_pred = model.transform(test)

		evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
		r2_train = evaluator.evaluate(train_pred)

		evaluator_test = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
		r2_test = evaluator_test.evaluate(test_pred)

		output = test_pred.select("prediction", "label", "features")

		return output, r2_test, r2_train
	
	else:

		###################
		#
		# train on all data
		#
		###################

		decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
		model = decision_tree_reg.fit(vector_dataframe)

		predictions = model.transform(vector_dataframe)

		output = predictions.select("prediction", "label", "features")

		###########################
		#
		# process to send to Kafka
		#
		###########################

		schema = StructType([StructField("prediction_mins", FloatType(), True),
							StructField("pick_up_hour", IntegerType(), True),
							StructField("pickup_longitude", DoubleType(), True),
							StructField("pickup_latitude", DoubleType(), True),
							StructField("dropoff_longitude", DoubleType(), True),
							StructField("dropoff_latitude", DoubleType(), True)])

		features_from_predictions = output.map(lambda row: (float(row.prediction),int(row.features[0]),float(row.features[1]),float(row.features[2]),float(row.features[3]),float(row.features[4]) ) ).collect()
		sqlContext.clearCache()
		dataframe_from_prediction_vector = sqlContext.createDataFrame(features_from_predictions,schema).cache()

		return dataframe_from_prediction_vector
Example #16
# TRAIN WITH CROSS-VALIDATION
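# NOTE: `cv` is assumed to have been constructed earlier. A minimal sketch (assumption,
# using LinearRegression as the estimator) of how such a CrossValidator could be built:
# from pyspark.ml.regression import LinearRegression
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# lr = LinearRegression(maxIter=10)
# grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
# cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
#                     evaluator=RegressionEvaluator(metricName="rmse"), numFolds=3)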
#cv_model = cv.fit(trainDataFrame)
cv_model = cv.fit(trainReg.toDF(['label','features']))


# EVALUATE MODEL ON TEST SET
#testDataFrame = sqlContext.createDataFrame(oneHotTESTreg, ["features", "label"])
testDataFrame = testReg.toDF(['label','features'])

# MAKE PREDICTIONS ON TEST DOCUMENTS
# cvModel uses the best model found (lrModel).
predictionAndLabels = cv_model.transform(testDataFrame)
predictionAndLabels.select("features", "label", "prediction").show()

# validate the results
# metric to measure how well a fitted Model does on held-out test data
evaluator = RegressionEvaluator(metricName="rmse")
rmse = evaluator.evaluate(predictionAndLabels)
print("Root-mean-square error = %s" % rmse)


#### LOGISTIC REGRESSION







Example #17
from pyspark.ml.recommendation import ALS

# Let's initialize our ALS learner
als = ALS()

# Now we set the parameters for the method
als.setMaxIter(5)\
   .setSeed(seed)\
   .setRegParam(0.1)\
   .setUserCol("userId").setItemCol("movieId").setRatingCol("rating")

# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")

tolerance = 0.03
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
min_error = float('inf')
best_rank = -1
for rank in ranks:
  # Set the rank here:
  als.setRank(rank)
  # Create the model with these parameters.
  model = als.fit(training_df)
  # Run the model to create a prediction. Predict against the validation_df.
  predict_df = model.transform(validation_df)
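  # A hedged sketch (assumption) of how such a loop typically concludes: filter out NaN
  # predictions (ALS cannot score users/items unseen in training), evaluate RMSE with
  # reg_eval, and keep the rank with the lowest error.
  # error = reg_eval.evaluate(predict_df.filter(predict_df.prediction != float('nan')))
  # errors[err] = error
  # models[err] = model
  # if error < min_error:
  #   min_error = error
  #   best_rank = rank
  # err += 1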
Example #18
# random forest estimator
rf = RandomForestRegressor(featuresCol="features",labelCol="relevance", maxDepth=5)

 
# param grid; additional parameters (including transformer params) can be added with more addGrid() calls (see the illustrative grid below)
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees,[5]) \
    .build()
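# An illustrative (hypothetical) wider grid, tuning numTrees and maxDepth together:
# paramGridWide = ParamGridBuilder() \
#     .addGrid(rf.numTrees, [5, 20]) \
#     .addGrid(rf.maxDepth, [4, 6]) \
#     .build()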
    

# model selection with TrainValidationSplit (a single train/validation split rather than k-fold cross-validation)
# https://docs.databricks.com/spark/latest/mllib/binary-classification-mllib-pipelines.html
from pyspark.ml.evaluation import RegressionEvaluator
cv = TrainValidationSplit(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol="relevance"),
                          trainRatio=0.8)  
# Run the train-validation split and choose the best set of parameters.
model = cv.fit(train_feat)



# rf = RandomForestRegressor(featuresCol="features",labelCol="relevance", numTrees=15, maxDepth=6)
# model = rf.fit(train_feat)

# prepare the test data following the same steps
test = testdata.join(descrdata, col("product_uid2") == col("product_uid1"), 'left').drop("product_uid2") \
    .withColumn("product_description", when(col("product_description").isNull(), "empty").otherwise(col("product_description")))

# remove special characters in all fields
test = test.withColumn("product_description", regexp_replace('product_description', '[^a-zA-Z1-9\\s]', '')) \