def test_fit_maximize_metric(self):
    """TrainValidationSplit must select the zero-induced-error model when
    maximizing r2, and report one validation metric per grid point."""
    rows = [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10
    dataset = self.spark.createDataFrame(rows, ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")

    grid = (ParamGridBuilder()
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
            .build())
    tvs = TrainValidationSplit(estimator=iee,
                               estimatorParamMaps=grid,
                               evaluator=evaluator)
    tvsModel = tvs.fit(dataset)

    bestModel = tvsModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
    validationMetrics = tvsModel.validationMetrics

    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
    self.assertEqual(len(grid), len(validationMetrics),
                     "validationMetrics has the same size of grid parameter")
    self.assertEqual(1.0, max(validationMetrics))
def main(input_file):
    """Train a RandomForest regressor on labeled-point data and report RMSE."""
    # Load and parse the data file into a DataFrame.
    data = MLUtils.loadLabeledPoints(sc, input_file)

    # Index categorical features automatically; columns with more than 10
    # distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=10).fit(data)

    # Hold out 30% of the rows for testing.
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain the indexer and the forest into one Pipeline; fitting the
    # pipeline runs the indexer as well.
    pipeline = Pipeline(stages=[featureIndexer, rf])
    model = pipeline.fit(trainingData)

    # Score the held-out split and show a few example rows.
    predictions = model.transform(testData)
    predictions.select("prediction", "label", "features").show(5)

    # Compute test error from (prediction, true label) pairs.
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))

    rfModel = model.stages[1]
    print(rfModel)  # summary only
def test_fit_maximize_metric(self):
    """When maximizing r2, TrainValidationSplit should pick the model
    whose induced error is zero."""
    sqlContext = SQLContext(self.sc)
    rows = [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10
    dataset = sqlContext.createDataFrame(rows, ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")
    grid = (ParamGridBuilder()
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
            .build())

    tvs = TrainValidationSplit(estimator=iee,
                               estimatorParamMaps=grid,
                               evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    bestModel = tvsModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

    self.assertEqual(0.0, bestModel.getOrDefault("inducedError"),
                     "Best model should have zero induced error")
    self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
def test_fit_minimize_metric(self):
    """When minimizing rmse, CrossValidator should pick the model whose
    induced error is zero (perfect fit, RMSE == 0)."""
    rows = [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10
    dataset = self.spark.createDataFrame(rows, ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="rmse")
    grid = ParamGridBuilder() \
        .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
        .build()

    cv = CrossValidator(estimator=iee,
                        estimatorParamMaps=grid,
                        evaluator=evaluator)
    cvModel = cv.fit(dataset)
    bestModel = cvModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")
def test_java_params(self):
    """
    Regression test for SPARK-18274: copying a Params instance in Python
    must not leave both copies linked to the same underlying Java object.
    """
    evaluator = RegressionEvaluator(metricName="r2")
    df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])

    # Force materialization of the Java-side evaluator.
    evaluator.evaluate(df)
    self.assertEqual(evaluator._java_obj.getMetricName(), "r2")

    # A copy with a different metric must get its own Java instance;
    # evaluating both must not cross-contaminate the metric names.
    evaluatorCopy = evaluator.copy({evaluator.metricName: "mae"})
    evaluator.evaluate(df)
    evaluatorCopy.evaluate(df)
    self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
    self.assertEqual(evaluatorCopy._java_obj.getMetricName(), "mae")
import os

# Load user features and keep only the "Restaurants" category.
# NOTE(review): assumes WORKDIR ends with a path separator — confirm,
# otherwise the filename is concatenated onto the last path component.
df = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")
df_restaurants = df.filter("category = \"Restaurants\"")

# Assemble the numeric user statistics into a single feature vector.
assembler = VectorAssembler(
    inputCols=["average_stars", "cat_avg_review_len", "cat_avg_stars",
               "cat_business_count", "cat_review_count", "months_yelping",
               "review_count", "votes_cool", "votes_funny", "votes_useful"],
    outputCol="features")
output = assembler.transform(df_restaurants)

# 70/30 train/test split.
(trainingData, testData) = output.randomSplit([0.7, 0.3])

dt = DecisionTreeRegressor(labelCol="elite", featuresCol="features")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
predictions.select("prediction", "elite", "features").show(5)

evaluator = RegressionEvaluator(
    labelCol="elite", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
# BUG FIX: the original used a Python 2 print statement, which is a
# syntax error under Python 3; print() works on both.
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
# VECTORIZE TRAIN DATA
energi_habis_train = ssc.textFileStream("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
# BUG FIX: createDataFrame must be called on an SQLContext *instance*,
# with the data and the column names as separate arguments. The original
# called it on the class and tuple-indexed the DStream, which raises.
# NOTE(review): createDataFrame expects an RDD or a local collection, not
# a DStream — this likely needs foreachRDD on the stream; confirm the
# intended streaming design with the author.
sqlContext = SQLContext(ssc.sparkContext)
energi_habis_train_labeled_DF = sqlContext.createDataFrame(
    energi_habis_train_labeled, ["label", "features"])
print(energi_habis_train_labeled_DF)

# VECTORIZE TEST DATA
energi_habis_test = ssc.textFileStream("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = sqlContext.createDataFrame(
    energi_habis_test_labeled, ["label", "features"])
print(energi_habis_test_labeled_DF)

# Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)

# Inspect the fitted model.
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Predict on the test data.
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction", "label", "features").show()

# Evaluate the predictions with R2.
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
                                metricName="r2")
evaluator.evaluate(predictions)
# Score the training data with both fitted models and display samples.
predictionsA = modelA.transform(trainingData)
print('-' * 70)
print('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print('-' * 70)

predictionsB = modelB.transform(trainingData)
print('-' * 70)
print('MODEL B : ')
predictionsB.select("prediction", "label", "features").show(30)
print('-' * 70)

# Compare the two models on RMSE using a single shared evaluator.
evaluator = RegressionEvaluator(metricName="rmse")

RMSE = evaluator.evaluate(predictionsA)
print('-' * 70)
print("ModelA: Root Mean Squared Error = " + str(RMSE))
print('-' * 70)
# ModelA: Root Mean Squared Error = 128.602026843

RMSE = evaluator.evaluate(predictionsB)
print('-' * 70)
print("ModelB: Root Mean Squared Error = " + str(RMSE))
print('-' * 70)
# ModelB: Root Mean Squared Error = 129.496300193
posTrain, posTest = pos.randomSplit([0.8, 0.2], seed=17)

# Hash the token column into a fixed-size feature vector.
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Three candidate regressors, each sharing the same hashing front-end.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrPipeline = Pipeline(stages=[hashingTF, lr])

dt = DecisionTreeRegressor(maxDepth=10, maxBins=50)
dtPipeline = Pipeline(stages=[hashingTF, dt])

rf = RandomForestRegressor(maxDepth=10, maxBins=50, numTrees=50)
rfPipeline = Pipeline(stages=[hashingTF, rf])

# Fit each pipeline on the training split and score the held-out split.
posLR = lrPipeline.fit(posTrain)
lrPred = posLR.transform(posTest)

posDT = dtPipeline.fit(posTrain)
dtPred = posDT.transform(posTest)

posRF = rfPipeline.fit(posTrain)
rfPred = posRF.transform(posTest)

# One evaluator, reused across the three prediction sets.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
lr_rmse = evaluator.evaluate(lrPred)
dt_rmse = evaluator.evaluate(dtPred)
rf_rmse = evaluator.evaluate(rfPred)

print("LR RMSE %g, DT RMSE %g, RF RMSE %g" % (lr_rmse, dt_rmse, rf_rmse))
# LR RMSE 0.44829, DT RMSE 0.312846, RF RMSE 0.300322
samples = df12.randomSplit([0.7, 0.3])
training = samples[0]
test = samples[1]

lr = LinearRegression(maxIter=5, regParam=0.3, labelCol="weight",
                      featuresCol="features", predictionCol="predic_weight")
model = lr.fit(training)

# BUG FIX: %d truncated the float metric to an integer, so R2 (a value in
# [0, 1]) always printed as 0 or 1; %g keeps the fractional part.
print("결정계수(R2):%g" % model.summary.r2)

d13 = model.transform(test)
d13.cache()
d13.select("weight", "predic_weight").show(5, False)

# One evaluator, re-targeted at each metric in turn.
evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")

# root mean squared error (the evaluator's default metric)
rmse = evaluator.evaluate(d13)
# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)
# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)
# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

# BUG FIX: same %d truncation for all four float metrics.
print("rmse:%g, mse:%g, r2:%g, mae:%g" % (rmse, mse, r2, mae))
# MAGIC To start, we'll generate the predictions by using the first model in `petalModels`.

# COMMAND ----------

petalPredictions = petalModels[0].transform(irisPetal)
display(petalPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll evaluate the model using the `RegressionEvaluator`.

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator

regEval = RegressionEvaluator().setLabelCol('petalWidth')

# BUG FIX: the original used Python 2 print statements, which are syntax
# errors under Python 3; print() works on both.
print(regEval.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC The default value for `RegressionEvaluator` is root mean square error (RMSE). Let's view that first.

# COMMAND ----------

print(regEval.evaluate(petalPredictions))

# COMMAND ----------

# MAGIC %md
modelprep1 = va.transform(enriched1).select('userId', 'movieId', 'rating', 'features')

training, testing, other = modelprep1.randomSplit([0.07, 0.03, 0.90])
# BUG FIX: the "Testing" line printed training.count() again; it now
# reports the size of the testing split.
print('[ INFO ] Training: ' + str(training.count()) + ' records')
print('[ INFO ] Testing: ' + str(testing.count()) + ' records')

gb = GBTRegressor(featuresCol="features", labelCol=var_target,
                  predictionCol="prediction", maxDepth=5, maxBins=32,
                  maxIter=20, seed=12345)
gbmodel = gb.fit(training)
#gbmodel.save('/tmp/spark_models/kaggle_bike_sharing_gb_model')

predictions = gbmodel.transform(testing)

print('[ INFO ] Printing predictions vs label...')
# BUG FIX: DataFrame.show() returns None, so the original
# show(...).select(...) chain raised AttributeError; select first, then show.
predictions.select('prediction', var_target).show(10, False)

evaluator = RegressionEvaluator(labelCol=var_target, predictionCol="prediction")
print('[ INFO ] Model Fit (RMSE): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})))
#print('[ INFO ] Model Fit (MSE): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "mse"})))
#print('[ INFO ] Model Fit (R2): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "r2"})))

total_runtime_seconds = (datetime.datetime.now() - start_time).seconds

print('#' * 100)
print('[ INFO ] Total Runtime: ' + str(total_runtime_seconds) + ' seconds')
print('#' * 100)

#ZEND
self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), "Best model should have zero induced error") self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0") def test_fit_maximize_metric(self): sqlContext = SQLContext(self.sc) dataset = sqlContext.createDataFrame([ (10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]) iee = InducedErrorEstimator() evaluator = RegressionEvaluator(metricName="r2") grid = (ParamGridBuilder() .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) .build()) cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) cvModel = cv.fit(dataset) bestModel = cvModel.bestModel bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), "Best model should have zero induced error") self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1") if __name__ == "__main__":
# Optional GLM summary diagnostics — uncomment to inspect the fit:
#print("Dispersion: " + str(summary.dispersion))
#print("Null Deviance: " + str(summary.nullDeviance))
#print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
#print("Deviance: " + str(summary.deviance))
#print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
#print("AIC: " + str(summary.aic))
#print("Deviance Residuals: ")
#summary.residuals().show()

# Make predictions.
predictions = glmmodel.transform(testing)

# Select example rows to display.
predictions.select("prediction", "label").show(30, False)

evaluator = RegressionEvaluator(metricName="rmse")  # rmse (default)|mse|r2|mae
RMSE = evaluator.evaluate(predictions)
# BUG FIX: the original Python 2 print statement is a syntax error under
# Python 3; print() works on both.
print('RMSE: ' + str(RMSE))

#######################################################################################
#
#   Modeling - Gradient Boosting (Regression)
#
#######################################################################################

gbt = GBTRegressor(featuresCol="features", labelCol="label", predictionCol="prediction",
                   maxDepth=5, maxBins=32, maxIter=20, seed=12345)
#gbt = GBTClassifier(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
gbtmodel = gbt.fit(training)
def spark_process(sqlContext, sc, validate, path_to_file):
    """Read NYC taxi trips from CSV, engineer features, and train a
    DecisionTreeRegressor that predicts trip duration in minutes.

    When ``validate`` is truthy, returns ``(test_predictions, r2_test,
    r2_train)`` from a 60/40 split; otherwise trains on all data and
    returns a flattened prediction DataFrame prepared for Kafka.
    """
    ######################
    #
    # HDFS to DataFrame
    #
    ######################

    ## all fields:
    # ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance',
    #  'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude',
    #  'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
    #  'tolls_amount', 'total_amount']

    # positional indices of the columns used below
    feature_columns = [1, 2, 3, 5, 6, 9, 10]

    customSchema = StructType([
        StructField("vendor_id", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", StringType(), True),
        StructField("trip_distance", StringType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("rate_code", StringType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
        StructField("payment_type", StringType(), True),
        StructField("fare_amount", StringType(), True),
        StructField("surcharge", StringType(), True),
        StructField("mta_tax", StringType(), True),
        StructField("tip_amount", StringType(), True),
        StructField("tolls_amount", StringType(), True),
        StructField("total_amount", StringType(), True)])

    # BUG FIX: the schema was passed as a reader *option* ("schema" is not
    # a CSV option, so customSchema was silently ignored and every column
    # inferred as string); it must go through DataFrameReader.schema().
    dataframe = (sqlContext.read.format('com.databricks.spark.csv')
                 .options(header='true')
                 .schema(customSchema)
                 .load(path_to_file))

    # keep only the selected columns
    dataframe = dataframe.select(*(dataframe.columns[n] for n in feature_columns))

    sqlContext.clearCache()

    ######################
    #
    # Preprocess data
    #
    ######################

    # drop rows missing timestamps or coordinates, default a missing
    # passenger count to 1, and keep only trips near NYC
    dataframe = dataframe.na.drop(how='any',
                                  subset=['pickup_datetime', 'dropoff_datetime',
                                          'pickup_longitude', 'pickup_latitude',
                                          'dropoff_longitude', 'dropoff_latitude']) \
        .fillna(1, subset=["passenger_count"]) \
        .filter(dataframe.pickup_latitude > 40.0) \
        .filter(dataframe.pickup_latitude < 41.0) \
        .filter(dataframe.pickup_longitude < -73.0) \
        .filter(dataframe.pickup_longitude > -74.0) \
        .filter(dataframe.dropoff_latitude > 40.0) \
        .filter(dataframe.dropoff_latitude < 41.0) \
        .filter(dataframe.dropoff_longitude < -73.0) \
        .filter(dataframe.dropoff_longitude > -74.0)

    ######################
    #
    # features engineering
    #
    ######################

    # trip duration in minutes (UDF) and pickup hour of day
    time_delta_udf = udf(time_delta_minutes, FloatType())
    dataframe = dataframe.withColumn('time_delta',
                                     time_delta_udf(dataframe.pickup_datetime,
                                                    dataframe.dropoff_datetime)) \
        .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))

    dataframe = dataframe.select(dataframe.pick_up_hour,
                                 dataframe.passenger_count.cast("integer"),
                                 dataframe.pickup_longitude.cast("double"),
                                 dataframe.pickup_latitude.cast("double"),
                                 dataframe.dropoff_longitude.cast("double"),
                                 dataframe.dropoff_latitude.cast("double"),
                                 dataframe.time_delta.cast("double"))

    # discard degenerate trips of a minute or less
    dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()

    # assemble the feature vector; time_delta becomes the label
    feature_assembler = VectorAssembler(
        inputCols=['pick_up_hour', 'pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude'],
        outputCol='features')
    transformed = feature_assembler.transform(dataframe)
    vector_dataframe = transformed.select(col("time_delta").alias("label"),
                                          col("features")).cache()

    ######################
    #
    # train model
    #
    ######################

    if validate:
        ################################
        #
        # validate model on 60/40 split
        #
        ################################
        training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)

        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(training)

        train_pred = model.transform(training)
        test_pred = model.transform(test)

        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="r2")
        r2_train = evaluator.evaluate(train_pred)

        evaluator_test = RegressionEvaluator(labelCol="label",
                                             predictionCol="prediction",
                                             metricName="r2")
        r2_test = evaluator_test.evaluate(test_pred)

        output = test_pred.select("prediction", "label", "features")
        return output, r2_test, r2_train
    else:
        ###################
        #
        # train on all data
        #
        ###################
        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(vector_dataframe)
        predictions = model.transform(vector_dataframe)
        output = predictions.select("prediction", "label", "features")

        ###########################
        #
        # process to send to Kafka
        #
        ###########################
        schema = StructType([StructField("prediction_mins", FloatType(), True),
                             StructField("pick_up_hour", IntegerType(), True),
                             StructField("pickup_longitude", DoubleType(), True),
                             StructField("pickup_latitude", DoubleType(), True),
                             StructField("dropoff_longitude", DoubleType(), True),
                             StructField("dropoff_latitude", DoubleType(), True)])

        # NOTE(review): DataFrame.map only exists on Spark 1.x; Spark 2+
        # requires output.rdd.map(...) — confirm the target Spark version.
        features_from_predictions = output.map(
            lambda row: (float(row.prediction),
                         int(row.features[0]),
                         float(row.features[1]),
                         float(row.features[2]),
                         float(row.features[3]),
                         float(row.features[4]))).collect()

        sqlContext.clearCache()
        dataframe_from_prediction_vector = sqlContext.createDataFrame(
            features_from_predictions, schema).cache()

        return dataframe_from_prediction_vector
# TRAIN WITH CROSS-VALIDATION
#cv_model = cv.fit(trainDataFrame)
cv_model = cv.fit(trainReg.toDF(['label', 'features']))

# EVALUATE MODEL ON TEST SET
#testDataFrame = sqlContext.createDataFrame(oneHotTESTreg, ["features", "label"])
testDataFrame = testReg.toDF(['label', 'features'])

# Score the test documents; cvModel uses the best model found (lrModel).
predictionAndLabels = cv_model.transform(testDataFrame)
predictionAndLabels.select("features", "label", "prediction").show()

# Measure how well the fitted model does on the held-out test data.
evaluator = RegressionEvaluator(metricName="rmse")
rmse = evaluator.evaluate(predictionAndLabels)
print("Root-mean-square error = %s" % rmse)

#### LOGISTIC REGRESSION
from pyspark.ml.recommendation import ALS

# Initialize the ALS learner and configure its hyperparameters.
als = ALS()
als.setMaxIter(5) \
   .setSeed(seed) \
   .setRegParam(0.1) \
   .setUserCol("userId").setItemCol("movieId").setRatingCol("rating")

# Evaluation metric for the held-out validation set.
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE evaluator comparing predicted and actual ratings.
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating",
                               metricName="rmse")

tolerance = 0.03
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
min_error = float('inf')
best_rank = -1

# Grid search over the candidate ranks.
for rank in ranks:
    als.setRank(rank)
    # Fit at this rank, then score the validation split.
    model = als.fit(training_df)
    predict_df = model.transform(validation_df)
# random forest estimator rf = RandomForestRegressor(featuresCol="features",labelCol="relevance", maxDepth=5) # paramgrid, can add param of transformer using addGrid() paramGrid = ParamGridBuilder() \ .addGrid(rf.numTrees,[5]) \ .build() # cross validation # https://docs.databricks.com/spark/latest/mllib/binary-classification-mllib-pipelines.html from pyspark.ml.evaluation import RegressionEvaluator cv = TrainValidationSplit(estimator=rf, estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(labelCol="relevance"), trainRatio=0.8) # Run cross-validation, and choose the best set of parameters. model = cv.fit(train_feat) # rf = RandomForestRegressor(featuresCol="features",labelCol="relevance", numTrees=15, maxDepth=6) # model = rf.fit(train_feat) # prepare the test data following the same steps test = testdata.join(descrdata, col("product_uid2") == col("product_uid1"), 'left').drop("product_uid2") \ .withColumn("product_description", when(col("product_description").isNull(), "empty").otherwise(col("product_description"))) # remove special characters in all fields test = test.withColumn("product_description", regexp_replace('product_description', '[^a-zA-Z1-9\\s]', '')) \