featuresCol = 'features') dtClassifier_cv = CrossValidator(estimator = DecisionTreeClassifier(), estimatorParamMaps = paramGrid, evaluator = MulticlassClassificationEvaluator(), numFolds = 5) dt_model = dtClassifier_cv.fit(trainingData) predictions = dt_model.transform(testData) predictions.select('prediction', 'indexed', 'label', 'features').show() print('Results of Decision Tree: {}'.format(evaluator.evaluate(predictions))) trainingData.persist() rfClassifier = RandomForestClassifier(labelCol = 'indexed', featuresCol = 'features' ) param_rf = ParamGridBuilder().addGrid(RandomForestClassifier) param_rf = ParamGridBuilder().addGrid(RandomForestClassifier.maxDepth, [3, 4, 5, 6]).addGrid(RandomForestClassifier.minInstancesPerNode, [3, 5, 7, 9]).build() rfClassifier_cv = CrossValidator(estimator = RandomForestClassifier(), estimatorParamMaps = param_rf, evaluator = MulticlassClassificationEvaluator(), numFolds = 5) rf_model = rfClassifier_cv.fit(trainingData) prediction_rf = rf_model.transform(testData) prediction_rf.select('prediction', 'indexed', 'label', 'features').show() print('Results of Random Forest: {}'.format(evaluator.evaluate(predictions_rf))) ####### #Gradient boosting trees classifier: from pyspark.ml.classification import GBTClassifier
StructField("\"\"\"\"residual sugar\"\"\"\"", FloatType(), True), StructField("\"\"\"\"chlorides\"\"\"\"", FloatType(), True), StructField("\"\"\"\"free sulfur dioxide\"\"\"\"", FloatType(), True), StructField("\"\"\"\"total sulfur dioxide\"\"\"\"", FloatType(), True), StructField("\"\"\"\"density\"\"\"\"", FloatType(), True), StructField("\"\"\"\"pH\"\"\"\"", FloatType(), True), StructField("\"\"\"\"sulphates\"\"\"\"", FloatType(), True), StructField("\"\"\"\"alcohol\"\"\"\"", FloatType(), True), StructField("\"\"\"\"quality\"\"\"\"", FloatType(), True) ]) training = spark.read.format("csv").option("header", "true").option( "delimiter", ";").schema(schema).load("s3n://643-pa2/TrainingDataset.csv") vectorAssembler = VectorAssembler(inputCols=[ "\"\"\"\"\"fixed acidity\"\"\"\"", "\"\"\"\"volatile acidity\"\"\"\"", "\"\"\"\"citric acid\"\"\"\"", "\"\"\"\"residual sugar\"\"\"\"", "\"\"\"\"chlorides\"\"\"\"", "\"\"\"\"free sulfur dioxide\"\"\"\"", "\"\"\"\"total sulfur dioxide\"\"\"\"", "\"\"\"\"density\"\"\"\"", "\"\"\"\"pH\"\"\"\"", "\"\"\"\"sulphates\"\"\"\"", "\"\"\"\"alcohol\"\"\"\"" ], outputCol='features') training_data = vectorAssembler.transform(training) training_data = training_data.select(['features', "\"\"\"\"quality\"\"\"\""]) training_data.show(3) rf = RandomForestClassifier(labelCol="\"\"\"\"quality\"\"\"\"", featuresCol='features', maxDepth=10) #lr = LinearRegression(featuresCol = 'features', labelCol="\"\"\"\"quality\"\"\"\"", maxIter=10, regParam=0.3, elasticNetParam=0.8) model = rf.fit(training_data) model.save("s3n://643-pa2/TrainingModel.model")
print("\n\nPrinting Training Schema with Features Table\n\n") dataDF.printSchema() # Random Splitting of Data splitValue = 0.7 trainingDF, testDF = defTrain.randomSplit([splitValue, 1 - splitValue]) print("\nSplitted Data into Training and Testing Dataset\n") # Random Forest Regression on TrainingDataset rf = RandomForestClassifier(featuresCol='features', labelCol='""""quality"""""', numTrees=100, maxBins=484, maxDepth=25, minInstancesPerNode=5, seed=34) rfPipeline = Pipeline(stages=[assembler, rf]) rfPipelineModel = rfPipeline.fit(trainingDF) evaluator = RegressionEvaluator(labelCol='""""quality"""""', predictionCol="prediction", metricName="rmse") rfTrainingPredictions = rfPipelineModel.transform(defTrain) rfTestPredictions = rfPipelineModel.transform(testDF) print( "\nCompleted Model Training...\n\nRandom Forest RMSE on traning data = %g\n" % evaluator.evaluate(rfTrainingPredictions)) print("\nRandom Forest RMSE on test data = %g\n" %
rfreg.maxBins, [32, 100, 200]).build() # evaluator cont_eval = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName='mse') ### For categorical outcomes # decision tree dt = DecisionTreeClassifier(labelCol='label', featuresCol='features') dt_pgrid = ParamGridBuilder().addGrid(dt.maxBins, [32, 80]).build() # random forest classifier rf = RandomForestClassifier(labelCol='label', featuresCol='features', numTrees=50) rf_pgrid = ParamGridBuilder().addGrid(rf.maxDepth, [3, 7]).addGrid(rf.maxBins, [32, 80]).build() # naive bayes classifier nb = NaiveBayes(labelCol='label', featuresCol='features') nb_pgrid = ParamGridBuilder().addGrid(nb.smoothing, [0, 0.3, 0.8]).build() # evaluator cat_eval = BinaryClassificationEvaluator(labelCol="label", metricName='areaUnderROC')
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features") final_df = assembler.transform(nwdf_no_strings) final_final_df = final_df.drop(*feature_columns).cache() # String indexing not required stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") si_model = stringIndexer.fit(final_final_df) td = si_model.transform(final_final_df) # Evaluators evaluator = MulticlassClassificationEvaluator(metricName='accuracy') eval = BinaryClassificationEvaluator() # RandomForest classifier rf = RandomForestClassifier(numTrees=100, maxDepth=16, labelCol="indexed", seed=42) model = rf.fit(td) result = model.transform(final_final_df) print('Accuracy on training data: ', evaluator.evaluate(result)) # Train test split for model evaluation train, test = final_final_df.randomSplit([0.7, 0.3], seed=12345) train.cache() test.cache() # --------------- # Random Forest: # --------------- rf = RandomForestClassifier(numTrees=100,
def featuresSelection(self, dataset_add, feature_colm, label_colm,
                      relation_list, relation, userId, algoName):
    """Compute random-forest feature importances plus a statistical test.

    Reads the parquet dataset at ``dataset_add``, runs the project's
    ``DataTransformation`` steps on it, fits a 10-tree random forest on an
    80 % split and returns the rounded feature importances together with the
    result of a Pearson test (regressor) or chi-square test (classifier).

    Args:
        dataset_add: Path of the parquet dataset to read.
        feature_colm: Feature columns, forwarded to ``dataTranform``.
        label_colm: Label column, forwarded to ``dataTranform``.
        relation_list: Column transformations applied when ``relation`` is
            ``"non_linear"``.
        relation: ``"non_linear"`` triggers ``colmTransformation``; any other
            value leaves the dataset as read.
        userId: Not used by this method.
        algoName: ``"random_regressor"`` or ``"random_classifier"``.

    Returns:
        A dict with the keys ``feature_importance``, ``summaryDict``,
        ``categoricalSummary`` and either ``pearson_test_data`` or
        ``ChiSquareTestData``; ``None`` if any step fails (the error message
        is printed, not raised).
    """
    # The original code fetched round() from builtins explicitly, presumably
    # because a pyspark.sql.functions star-import shadows it - confirm.
    import builtins

    try:
        dataset = spark.read.parquet(dataset_add)

        # changing the relationship of the colm
        if relation == "non_linear":
            dataTransformationObj = DataTransformation(dataset=dataset)
            dataset = dataTransformationObj.colmTransformation(
                colmTransformationList=relation_list)

        # transformation
        dataTransformationObj = DataTransformation(dataset=dataset)
        dataTransformationResult = dataTransformationObj.dataTranform(
            labelColm=label_colm, featuresColm=feature_colm)
        dataset = dataTransformationResult["dataset"]
        categoricalFeatures = dataTransformationResult["categoricalFeatures"]
        numericalFeatures = dataTransformationResult["numericalFeatures"]
        maxCategories = dataTransformationResult["maxCategories"]
        categoryColmStats = dataTransformationResult["categoryColmStats"]
        indexedFeatures = dataTransformationResult["indexedFeatures"]
        label = dataTransformationResult["label"]

        # statistics
        dataTransformationObj = DataTransformation(dataset=dataset)
        summaryDict = dataTransformationObj.dataStatistics(
            categoricalFeatures=categoricalFeatures,
            numericalFeatures=numericalFeatures)

        # Only the 80 % part is used for fitting; the seed keeps it stable.
        trainData, _ = dataset.randomSplit([0.80, 0.20], seed=40)

        # Pick the statistical test and the forest flavour for the algorithm.
        if algoName == "random_regressor":
            statisticalTestObj = StatisticalTest(dataset=dataset,
                                                 features=numericalFeatures,
                                                 labelColm=label)
            statisticalTestResult = statisticalTestObj.pearsonTest()
            randomForestModel = RandomForestRegressor(
                labelCol=label,
                featuresCol='vec_indexed_features',
                numTrees=10,
                maxBins=maxCategories)
            keyStatsTest = "pearson_test_data"
        elif algoName == "random_classifier":
            statisticalTestObj = StatisticalTest(dataset=dataset,
                                                 features=indexedFeatures,
                                                 labelColm=label)
            statisticalTestResult = statisticalTestObj.chiSquareTest(
                categoricalFeatures=categoricalFeatures,
                maxCategories=maxCategories)
            randomForestModel = RandomForestClassifier(
                labelCol=label,
                featuresCol='vec_indexed_features',
                numTrees=10,
                maxBins=maxCategories)
            keyStatsTest = "ChiSquareTestData"
        else:
            # Previously this fell through and failed later with an
            # unbound-variable error; report the real cause instead.
            raise ValueError(
                "unsupported algoName {!r}: expected 'random_regressor' or "
                "'random_classifier'".format(algoName))

        randomForestModelFit = randomForestModel.fit(trainData)
        print(randomForestModelFit.featureImportances)

        feature_importance = (
            randomForestModelFit.featureImportances.toArray().tolist())
        print(feature_importance)
        featureImportance = [builtins.round(x, 4) for x in feature_importance]
        print(featureImportance)

        # NOTE(review): assumes the feature vector is ordered numerical
        # features first, then categorical - confirm against dataTranform.
        features_column_for_user = numericalFeatures + categoricalFeatures
        feature_imp = {
            'feature_importance': featureImportance,
            "feature_column": features_column_for_user
        }
        response_dict = {
            'feature_importance': feature_imp,
            keyStatsTest: statisticalTestResult,
            'summaryDict': summaryDict,
            'categoricalSummary': categoryColmStats
        }
        return response_dict
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(str(e))
        return None
# COMMAND ----------

# Grid of random-forest settings to try; every (trees, depth) pair is trained
# once and handed to classificationModel together with its run parameters.
numTreesList = [10, 25, 50]
maxDepthList = [3, 10, 5]

for numTrees in numTreesList:
    for maxDepth in maxDepthList:
        # Parameters for this run, extended with the shared noise / date
        # settings.
        params = {
            "numTrees": numTrees,
            "maxDepth": maxDepth,
            "model": "RandomForest"
        }
        params.update(dg_noise)
        params.update(model_data_date)

        rf = RandomForestClassifier(labelCol="indexedLabel",
                                    featuresCol="features",
                                    numTrees=numTrees,
                                    maxDepth=maxDepth)
        stages = [labelIndexer, assembler, scaler, rf, labelConverter]
        model, predictions, accuracy, ml_run_info = classificationModel(
            stages, params, train_data, test_data)
        print("Trees: %s, Depth: %s, Accuracy: %s\n" %
              (numTrees, maxDepth, accuracy))

# COMMAND ----------

# MAGIC %md
# MAGIC ### Get Best Run and Metric from MLflow

# COMMAND ----------

# Experiment id taken from the run info of the last model trained above.
mlflow_experiment_id = ml_run_info.experiment_id
def main(base_path):
    """Train, evaluate and persist the flight-delay random forest classifier.

    Reads ``{base_path}/data/simple_flight_delay_features.json``, buckets
    ``ArrDelay`` into classes, index-encodes the categorical columns,
    assembles one feature vector, trains a ``RandomForestClassifier`` on
    several random train/test splits, prints the metrics and appends their
    averages to ``{base_path}/models/score_log.pickle``. Every fitted
    transformer and the most recently trained model are saved under
    ``{base_path}/models``.

    Args:
        base_path: Project root that contains the ``data`` and ``models``
            directories.
    """
    import pickle
    from collections import defaultdict

    import numpy as np

    APP_NAME = "train_spark_mllib_model.py"

    # Set up the Spark environment. getOrCreate() reuses a session that is
    # already running instead of failing on a second SparkContext.
    import findspark
    findspark.init()
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName(APP_NAME).getOrCreate()

    # pyspark is only importable after findspark.init().
    from pyspark.ml.classification import RandomForestClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.ml.feature import Bucketizer, StringIndexer, VectorAssembler
    from pyspark.sql.functions import concat, lit
    from pyspark.sql.types import (
        DateType,
        DoubleType,
        IntegerType,
        StringType,
        StructField,
        StructType,
        TimestampType,
    )

    #
    # Example input record:
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features.json".format(base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()  # force a read so a bad path fails right here

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column, features.where(features[column].isNull()).count())
        for column in features.columns
    ]
    cols_with_nulls = [entry for entry in null_counts if entry[1] > 0]
    print(cols_with_nulls)

    #
    # Add a Route variable to replace FlightNum
    #
    features_with_route = features.withColumn(
        'Route',
        concat(features.Origin, lit('-'), features.Dest)
    )
    features_with_route.show(6)

    #
    # Bucketize ArrDelay with pyspark.ml.feature.Bucketizer. The five split
    # points below produce four buckets (0-3): below -15, -15 to 0, 0 to 30
    # and above 30.
    #
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Turn category fields into indexes with StringIndexer
    #
    categorical_columns = ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                           "Origin", "Dest", "Route"]
    for column in categorical_columns:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the fitted indexer
        string_indexer_output_path = (
            "{}/models/string_indexer_model_{}.bin".format(base_path, column))
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine the numeric fields and the category indexes into one feature
    # vector. The index columns are derived from the loop above so each one
    # appears exactly once (the hand-written list repeated "Origin_index",
    # which duplicated that feature in the vector).
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = [column + "_index" for column in categorical_columns]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    final_vectorized_features = final_vectorized_features.drop(*index_columns)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Train and evaluate the classifier on `split_count` random train/test
    # splits, recording four metrics per split
    #
    scores = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3
    model_output_path = (
        "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin"
        .format(base_path))

    for i in range(1, split_count + 1):
        print("Run {} out of {} of test/train splits in cross validation...".format(
            i,
            split_count,
        ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit the random forest classifier on the training
        # split.
        # NOTE(review): maxBins=4657 presumably covers the largest indexed
        # category count (Route) - confirm.
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4657,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model.write().overwrite().save(model_output_path)

        # Evaluate the model on the held-out split, once per metric
        predictions = model.transform(test_data)
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name
            )
            score = evaluator.evaluate(predictions)
            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

    #
    # Evaluate average and STD of each metric
    #
    score_averages = {}
    for metric_name in metric_names:
        metric_scores = scores[metric_name]
        average_score = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_score
        print("AVG {} = {:.4f}".format(metric_name, average_score))
        print("STD {} = {:.4f}".format(metric_name, np.std(metric_scores)))

    #
    # Persist the scores to a score log that exists between runs
    #
    score_log_filename = "{}/models/score_log.pickle".format(base_path)

    # Load the score log, or start an empty one when it is missing, empty or
    # unreadable. NOTE: pickle must only be used on this script's own log
    # file, never on untrusted input.
    try:
        with open(score_log_filename, "rb") as score_log_file:
            score_log = pickle.load(score_log_file)
    except (IOError, EOFError, pickle.UnpicklingError):
        score_log = []
    if not isinstance(score_log, list):
        score_log = []

    # Build this run's score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric; the first run
    # is compared with itself, giving deltas of zero.
    last_log = score_log[-1] if score_log else score_log_entry
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        print("{} delta: {:.4f}".format(metric_name, run_delta))

    # Append this run's average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    with open(score_log_filename, "wb") as score_log_file:
        pickle.dump(score_log, score_log_file)
"fixedacidity", "volatileacidity", "citricacid", "residualsugar", "chlorides", "freesulfurdioxide", "totalsulfurdioxide", "density", "ph", "sulphates", "alcohol" ], outputCol='features') # ## Step 3 : Prepare Classifier ( Random Forest in this case ) from pyspark.ml import Pipeline from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.classification import RandomForestClassifier # ## Grid Search - Spark ML way # ### using Grid Search and cross validation from pyspark.ml.tuning import CrossValidator, ParamGridBuilder RFclassifier = RandomForestClassifier(labelCol='label', featuresCol='features', impurity=param_impurity) pipeline = Pipeline(stages=[labelIndexer, featureIndexer, RFclassifier]) # ### Define test configutations (to be evaluated in Grid) paramGrid = ParamGridBuilder()\ .addGrid(RFclassifier.maxDepth, param_maxDepth )\ .addGrid(RFclassifier.numTrees, param_numTrees )\ .build() # ### Defing metric by wich the model will be evaluated evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC') crossval = CrossValidator( estimator=pipeline,
# Naive Bayes on the TF-IDF features.
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData2)
# The results below are reported as "accuracy", so request that metric
# explicitly: MulticlassClassificationEvaluator defaults to F1.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy")
predictions = model.transform(testData2)
result4 = evaluator.evaluate(predictions)
print('naive bayes accuracy using TF-IDF features is : ' + str(result4))

#################################################################################################
## random forest using count vectors features
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=100,
                            maxDepth=4,
                            maxBins=32)

# Train model with Training Data
rfModel = rf.fit(trainingData1)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy")
predictions = rfModel.transform(testData1)
result5 = evaluator.evaluate(predictions)
print('random forest accuracy using count vectors features is: ' + str(result5))

## random forest using TF-IDF features
from pyspark.ml.classification import RandomForestClassifier
# COMMAND ----------

# Column lists for the indexers and the vector assembler, followed by the
# random forest classifier and the stage that maps predictions back to the
# original label strings.
features = ['fire_class', 'discovery_month', 'discovery_year']
features2 = [
    'fire_size', 'latitude', 'longitude', 'vegetation', 'fire_magnitude',
    'temp_discovery', 'wind_discovery', 'humid_discovery',
    'precip_discovery', 'remoteness', 'tempBucket', 'windBucket',
    'humidBucket', 'precipBucket', 'fire_class_index',
    'discovery_month_index', 'discovery_year_index',
]

# Indexers are fitted on the training set up front.
labelIndexer = StringIndexer(inputCol='fire_cause',
                             outputCol='label').fit(train)
featureIndexer = [
    StringIndexer(inputCol=name, outputCol=name + "_index").fit(train)
    for name in features
]

assembler = VectorAssembler(inputCols=features2, outputCol="features")

rf = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    impurity='gini',
    maxDepth=10,
    numTrees=35,
    featureSubsetStrategy='auto',
)

labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Pipeline stages: feature indexers first, then label indexing, bucketizers,
# assembly, the classifier and the label converter.
stag2 = [
    *featureIndexer,
    labelIndexer,
    tempBucketizer,
    windBucketizer,
    humidBucketizer,
    precipBucketizer,
    assembler,
    rf,
    labelConverter,
]

# COMMAND ----------

pl2 = Pipeline(stages=stag2)

# Fit the pipeline on the training data
plTraining2 = pl2.fit(train)

# Apply the fitted pipeline to the test data
predTest2 = plTraining2.transform(test)
def randomForest(trainingData,
                 testData,
                 impurity,
                 maxDepth,
                 maxBins,
                 numTrees,
                 enableCrossValidator=False,
                 featuresCol='features',
                 labelCol='label',
                 predictionCol='prediction',
                 probabilityCol='probability',
                 rawPredictionCol='rawPrediction',
                 minInstancesPerNode=1,
                 minInfoGain=0.0,
                 maxMemoryInMB=256,
                 cacheNodeIds=False,
                 checkpointInterval=10,
                 featureSubsetStrategy='auto',
                 seed=None,
                 subsamplingRate=1.0):
    """Train a RandomForestClassifier and predict the test set.

    Both inputs are RDDs of indexable records from which element 31 is read
    as the row index, element 30 as the label and elements 1..28 as the
    feature values.

    Args:
        trainingData: RDD of training records.
        testData: RDD of test records.
        impurity, maxDepth, maxBins, numTrees: Forwarded to
            ``RandomForestClassifier``.
        enableCrossValidator: When true, the classifier is wrapped in a
            5-fold ``CrossValidator`` (with an empty parameter grid) scored
            by a ``BinaryClassificationEvaluator``.
        featuresCol, labelCol, predictionCol, probabilityCol,
        rawPredictionCol: Column names used by the classifier, the evaluator
            and the DataFrames built here.
        minInstancesPerNode, minInfoGain, maxMemoryInMB, cacheNodeIds,
        checkpointInterval, featureSubsetStrategy, seed, subsamplingRate:
            Forwarded to ``RandomForestClassifier``.

    Returns:
        An RDD of ``(prediction, label, index)`` tuples for the test set.
    """
    print("\nInizio classificazione con RandomForestClassifier")

    # Build the classifier from the given (and default) parameters.
    rfc = RandomForestClassifier(featuresCol=featuresCol,
                                 labelCol=labelCol,
                                 predictionCol=predictionCol,
                                 probabilityCol=probabilityCol,
                                 rawPredictionCol=rawPredictionCol,
                                 maxDepth=maxDepth,
                                 maxBins=maxBins,
                                 minInstancesPerNode=minInstancesPerNode,
                                 minInfoGain=minInfoGain,
                                 maxMemoryInMB=maxMemoryInMB,
                                 cacheNodeIds=cacheNodeIds,
                                 checkpointInterval=checkpointInterval,
                                 impurity=impurity,
                                 numTrees=numTrees,
                                 featureSubsetStrategy=featureSubsetStrategy,
                                 seed=seed,
                                 subsamplingRate=subsamplingRate)
    print(" -modello creato")

    if enableCrossValidator:
        # k-fold cross validation around the classifier. The evaluator reads
        # the same columns the classifier writes, so custom column names
        # keep working.
        evaluator = BinaryClassificationEvaluator(
            labelCol=labelCol, rawPredictionCol=rawPredictionCol)
        validator = CrossValidator(estimator=rfc,
                                   estimatorParamMaps=ParamGridBuilder().build(),
                                   evaluator=evaluator,
                                   numFolds=5)
    else:
        validator = rfc
    print(" -validator creato")

    # The DataFrame columns are named after the configured feature and label
    # columns, so the classifier finds them even when the defaults are
    # overridden.
    training = trainingData.map(
        lambda x: (x[31], Vectors.dense(x[1:29]), x[30])).toDF(
            schema=['index', featuresCol, labelCol]).orderBy('index')

    pipeline = Pipeline(stages=[validator])
    model = pipeline.fit(training)
    print(" -modello addestrato con la pipeline (" + str(training.count()) +
          " elementi utilizzati come training)")

    test = testData.map(
        lambda x: (x[30], Vectors.dense(x[1:29]), x[31])).toDF(
            schema=[labelCol, featuresCol, 'index']).orderBy('index')

    # (prediction, label, index) - read by column name, so the result does
    # not depend on how many output columns the model appends.
    predictionsAndLabels = model.transform(test).rdd.map(
        lambda row: (row[predictionCol], row[labelCol], row['index']))
    print(" -" + str(predictionsAndLabels.count()) + " elementi predetti (" +
          str(test.count()) + " elementi usati come test)")

    return predictionsAndLabels
def main(base_path):
    """Train, evaluate and persist the flight-delay random forest classifier.

    Reads ``{base_path}/data/simple_flight_delay_features.json``, buckets
    ``ArrDelay`` into classes, index-encodes the categorical columns,
    assembles one feature vector, trains a ``RandomForestClassifier`` on
    several random train/test splits, prints the metrics and appends their
    averages to ``{base_path}/models/score_log.pickle``. Every fitted
    transformer and the most recently trained model are saved under
    ``{base_path}/models``.

    Args:
        base_path: Project root that contains the ``data`` and ``models``
            directories.
    """
    import pickle
    from collections import defaultdict

    import numpy as np

    APP_NAME = "train_spark_mllib_model.py"

    # Set up the Spark environment. getOrCreate() reuses a session that is
    # already running instead of failing on a second SparkContext.
    import findspark
    findspark.init()
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName(APP_NAME).getOrCreate()

    # pyspark is only importable after findspark.init().
    from pyspark.ml.classification import RandomForestClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.ml.feature import Bucketizer, StringIndexer, VectorAssembler
    from pyspark.sql.functions import concat, lit
    from pyspark.sql.types import (
        DateType,
        DoubleType,
        IntegerType,
        StringType,
        StructField,
        StructType,
        TimestampType,
    )

    #
    # Example input record:
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features.json".format(base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()  # force a read so a bad path fails right here

    #
    # Check the features for null values before using Spark ML
    #
    null_counts = [
        (column, features.where(features[column].isNull()).count())
        for column in features.columns
    ]
    cols_with_nulls = [entry for entry in null_counts if entry[1] > 0]
    print(cols_with_nulls)

    #
    # Add a Route variable to replace FlightNum
    #
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Bucketize ArrDelay with pyspark.ml.feature.Bucketizer. The five split
    # points below produce four buckets (0-3): below -15, -15 to 0, 0 to 30
    # and above 30.
    #
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Turn category fields into indexes with StringIndexer
    #
    categorical_columns = [
        "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest",
        "Route"
    ]
    for column in categorical_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the fitted indexer
        string_indexer_output_path = (
            "{}/models/string_indexer_model_{}.bin".format(base_path, column))
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine the continuous numeric fields with the category indexes into
    # one feature vector. The index columns are derived from the loop above
    # so each one appears exactly once (the hand-written list repeated
    # "Origin_index", which duplicated that feature in the vector).
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = [column + "_index" for column in categorical_columns]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    final_vectorized_features = final_vectorized_features.drop(*index_columns)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Train and evaluate the classifier on `split_count` random train/test
    # splits, recording four metrics per split
    #
    scores = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3
    model_output_path = (
        "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin"
        .format(base_path))

    for i in range(1, split_count + 1):
        print("Run {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit the random forest classifier on the training
        # split.
        # NOTE(review): maxBins=4657 presumably covers the largest indexed
        # category count (Route) - confirm.
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4657,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model.write().overwrite().save(model_output_path)

        # Evaluate the model on the held-out split, once per metric
        predictions = model.transform(test_data)
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)
            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

    #
    # Evaluate the average and standard deviation of each metric
    #
    score_averages = {}
    for metric_name in metric_names:
        metric_scores = scores[metric_name]
        average_score = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_score
        print("AVG {} = {:.4f}".format(metric_name, average_score))
        print("STD {} = {:.4f}".format(metric_name, np.std(metric_scores)))

    #
    # Persist the scores to a score log that exists between runs
    #
    score_log_filename = "{}/models/score_log.pickle".format(base_path)

    # Load the score log, or start an empty one when it is missing, empty or
    # unreadable. NOTE: pickle must only be used on this script's own log
    # file, never on untrusted input.
    try:
        with open(score_log_filename, "rb") as score_log_file:
            score_log = pickle.load(score_log_file)
    except (IOError, EOFError, pickle.UnpicklingError):
        score_log = []
    if not isinstance(score_log, list):
        score_log = []

    # Build this run's score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric; the first run
    # is compared with itself, giving deltas of zero.
    last_log = score_log[-1] if score_log else score_log_entry
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        print("{} delta: {:.4f}".format(metric_name, run_delta))

    # Append this run's average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    with open(score_log_filename, "wb") as score_log_file:
        pickle.dump(score_log, score_log_file)
# Multilayer perceptron: configured through constructor keywords; the block
# size is set to the number of training rows.
nn = MultilayerPerceptronClassifier(layers=layers,
                                    labelCol='speciesIndex',
                                    featuresCol='features',
                                    blockSize=training_data.count(),
                                    seed=1234)
model = nn.fit(training_data)
classifications = model.transform(test_data)
accuracy = evaluator.evaluate(classifications)
print("Accuracy: %s" % accuracy)

# ### Random Forest

# In[19]:

from pyspark.ml.classification import RandomForestClassifier

# In[20]:

# Forest of 40 trees on the same label / feature columns.
rf = RandomForestClassifier(labelCol='speciesIndex',
                            featuresCol='features',
                            numTrees=40)

# In[21]:

model = rf.fit(training_data)

# In[22]:

classifications = model.transform(test_data)
accuracy = evaluator.evaluate(classifications)
print("Accuracy: %s" % accuracy)

# In[ ]:
def report_confusion(predictions, label_col="survived",
                     prediction_col="prediction"):
    """Show the predictions and print confusion-matrix counts and accuracy.

    The positive class is 1, so a true positive is a row whose label and
    prediction are both 1.

    Args:
        predictions: DataFrame holding the label and prediction columns.
        label_col: Name of the ground-truth column.
        prediction_col: Name of the predicted-label column.

    Returns:
        The tuple ``(tp, tn, fp, fn)``.
    """
    predictions.select(label_col, prediction_col).show()
    actual = col(label_col)
    predicted = col(prediction_col)
    tp = predictions.filter((actual == 1) & (predicted == 1)).count()
    tn = predictions.filter((actual == 0) & (predicted == 0)).count()
    fp = predictions.filter((actual == 0) & (predicted == 1)).count()
    fn = predictions.filter((actual == 1) & (predicted == 0)).count()
    total = tp + tn + fp + fn
    print(tp, tn, fp, fn)
    # An empty result has no defined accuracy; report NaN instead of
    # dividing by zero.
    print("acc=", (tp + tn) / total if total else float("nan"))
    return tp, tn, fp, fn


# Confusion counts for the predictions of the previous model.
tp, tn, fp, fn = report_confusion(predict_test)

from pyspark.ml.classification import RandomForestClassifier

# Same evaluation for a random forest classifier.
dt = RandomForestClassifier(labelCol="Survived", featuresCol="features")
dtmodel = dt.fit(train)
predict_test = dtmodel.transform(test)
tp, tn, fp, fn = report_confusion(predict_test)
print("Area under PR curve: " + str(area_under_pr))
print("F1 score = %g" % f1_score)
print("Accuracy = %g" % accuracy)
print(
    "########################################################################"
)

# Show label vs. prediction for the first 10 rows.
lr_result.select('label', 'pred').show(10)

# ---------------------------------------------------------------------------
# Random Forest classification
# ---------------------------------------------------------------------------
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='label',
    predictionCol='pred',
    rawPredictionCol='pred_raw',
)
rf_model = rf.fit(trainDF)
rf_result = rf_model.transform(testDF)

# Score the random forest output with the three evaluators.
area_under_pr = evaluator1.evaluate(rf_result)
f1_score = evaluator2.evaluate(rf_result)
accuracy = evaluator3.evaluate(rf_result)

print("")
print(
    "########################################################################"
)
print("RANDOM FOREST RESULTS")
print("Area under PR curve: " + str(area_under_pr))
print("F1 score = %g" % f1_score)
labelCol='label', maxDepth=3) dtModel = dt.fit(train) predictions = dtModel.transform(test) predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction', 'probability').show(10) #22 evaluator = BinaryClassificationEvaluator() print("Test Area Under ROC: " + str( evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"}))) #23 from pyspark.ml.classification import RandomForestClassifier rf = RandomForestClassifier(featuresCol='features', labelCol='label') rfModel = rf.fit(train) predictions = rfModel.transform(test) predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction', 'probability').show(10) #24 evaluator = BinaryClassificationEvaluator() print("Test Area Under ROC: " + str( evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"}))) #31 print(rf.explainParams()) #27 from pyspark.ml.classification import GBTClassifier
# BinaryClassificationMetrics consumes (score, label) pairs, so the prediction
# column must come first; passing (label, prediction) scores the labels
# against the predictions instead.
print('AUC:', BinaryClassificationMetrics(
    predictions.select('prediction', 'label').rdd).areaUnderROC)

bestModel = cvModel.bestModel

# Applicable to your model: list every stage of the best pipeline.
for stage in bestModel.stages:
    print(stage)

print(bestModel.stages[3].extractParamMap())

# COMMAND ----------

# CV model of Random Forest Classifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = (
    RandomForestClassifier()
    .setFeaturesCol("features")
    .setLabelCol("label")
)

from pyspark.ml import Pipeline

pipeline = Pipeline().setStages([
    ipindexer,     # categorize international plan
    labelindexer,  # categorize churn
    assembler,     # assemble the feature vector for all columns
    rf])

pipelineModel = pipeline.fit(trainDF)

numFolds = 3

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# The patient id is the third '_'-separated token of the record name.
split_col = pyspark.sql.functions.split(df['name'], '_')
df = df.withColumn('patient', split_col.getItem(2))

# Choose the training patients at random (fixed seed).
number_of_training_patients = int(
    df.select('patient').distinct().count() * TRAINING_RATIO
)
training_patients = (
    df.select('patient')
    .distinct()
    .orderBy(rand(seed=1))
    .limit(number_of_training_patients)
)

# Rows of selected patients form the training set; all remaining rows form
# the test set.
trainingData = df.join(training_patients, ['patient'], 'inner')
testData = df.join(training_patients, ['patient'], 'leftanti')

# Random forest estimator.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=NUM_TREES,
    maxDepth=MAX_DEPTH,
)

# Assemble the stages into a pipeline and fit it on the training rows.
pipeline = Pipeline(stages=[labelIndexer, features, rf])
model = pipeline.fit(trainingData)

# Score the held-out rows.
predictions = model.transform(testData)


# NOTE(review): the rest of this function lies beyond this chunk.
def get_metrics(predictions):
    auc = BinaryClassificationEvaluator().evaluate(predictions)
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Train Data Number of Row: ' + str(train.count()))
print('Validate Data Number of Row: ' + str(validate.count()))
print('Test Data Number of Row: ' + str(test.count()))

# Apply Logistic Regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter=100, regParam=0.05, labelCol='index').fit(train)

# Evaluate models with BinaryClassificationEvaluator's default metric
from pyspark.ml.evaluation import BinaryClassificationEvaluator


def testModel(model, validate=validate):
    """Score ``model`` on ``validate`` and return the evaluator's metric.

    ``validate`` defaults to the validation DataFrame that is in scope when
    this function is defined; labels are read from the 'index' column.
    """
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)


from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)

# NOTE: the 'DecistionTree' key is kept as-is because code outside this chunk
# may look it up.
models = {'LogisticRegression': lr, 'DecistionTree': dt, 'RandomForest': rf}

# dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
modelPerf = {k: testModel(v) for k, v in models.items()}
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier from mmlspark.train import TrainClassifier import itertools lrHyperParams = [0.05, 0.2] logisticRegressions = [ LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams ] lrmodels = [ TrainClassifier(model=lrm, labelCol="label").fit(ptrain) for lrm in logisticRegressions ] rfHyperParams = itertools.product([5, 10], [2, 3]) randomForests = [ RandomForestClassifier(numTrees=hyperParam[0], maxDepth=hyperParam[1]) for hyperParam in rfHyperParams ] rfmodels = [ TrainClassifier(model=rfm, labelCol="label").fit(ptrain) for rfm in randomForests ] gbtHyperParams = itertools.product([8, 16], [2, 3]) gbtclassifiers = [ GBTClassifier(maxBins=hyperParam[0], maxDepth=hyperParam[1]) for hyperParam in gbtHyperParams ] gbtmodels = [ TrainClassifier(model=gbt, labelCol="label").fit(ptrain) for gbt in gbtclassifiers
metricName="f1") metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple)) print('AUC ROC of Decision Tree model is %f' % evaluator1.evaluate(pred)) print('F1 score of Decision Tree model is %f' % evaluator2.evaluate(pred)) metrics.confusionMatrix().toArray().transpose() # <a id="context323"></a> # #### 3.2.3. Random Forest # In[19]: from pyspark.ml.classification import RandomForestClassifier # model on training data numTrees is the hyperparameter rfModel = RandomForestClassifier(numTrees=100).fit(trainData) # make prediction on test data pred = rfModel.transform(testData) pred.select('catLabel', 'label', 'prediction').show() evaluator1 = BinaryClassificationEvaluator(labelCol='label', metricName="areaUnderROC") evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName="f1") metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple)) print('AUC ROC of Random Forest model is %f' % evaluator1.evaluate(pred)) print('F1 score of Random Forest model is %f' % evaluator2.evaluate(pred)) metrics.confusionMatrix().toArray().transpose()
# COMMAND ----------

# MAGIC %md
# MAGIC ####Random Forest
# MAGIC
# MAGIC Random Forests uses an ensemble of trees to improve model accuracy.
# MAGIC
# MAGIC You can read more about Random Forest from the programming guide [here](http://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests).

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

# Initial random forest estimator reading the "features" and "label" columns.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
)

# Fit the estimator on the training data.
rfModel = rf.fit(trainingData)

# COMMAND ----------

# Score the test data with the fitted model's transform() method.
predictions = rfModel.transform(testData)

# COMMAND ----------

predictions.printSchema()

# COMMAND ----------
# Index the `vehicle_color` field:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCol="vehicle_color",
    outputCol="vehicle_color_indexed",
)

# Create a dummy (one-hot) encoding for the categorical
# `vehicle_color_indexed`:
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(
    inputCol="vehicle_color_indexed",
    outputCol="vehicle_color_encoded",
)

# Select the features:
from pyspark.ml.feature import VectorAssembler

features = ["reviewed", "vehicle_year", "vehicle_color_encoded", "CloudCover"]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Specify the estimator (i.e., classification algorithm):
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(
    featuresCol="features",
    labelCol="star_rating",
)
print(classifier.explainParams())

# Specify the values of the hyperparameter grid:
from pyspark.ml.tuning import ParamGridBuilder

maxDepthList = [5, 10, 20]
numTreesList = [20, 50, 100]
subsamplingRateList = [0.5, 1.0]
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.maxDepth, maxDepthList)
    .addGrid(classifier.numTrees, numTreesList)
    .addGrid(classifier.subsamplingRate, subsamplingRateList)
    .build()
)

# Specify the evaluator:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# NOTE(review): the statements down to plt.show() look like the tail of a
# definition that starts before this chunk — confirm their indentation there.
print(
    f" | recall = {metrics.recall()}\n | F1-Score = {metrics.fMeasure()}")

# Draw the confusion matrix as an annotated heatmap.
conf_matrix = metrics.confusionMatrix().toArray()
sns.set(font_scale=1.4)  # label size
ax = sns.heatmap(conf_matrix, annot=True, annot_kws={"size": 16})
ax.set(xlabel='Predicted Label', ylabel='True Label', title='Confusion Mtx')
plt.show()

# In[127]:

# Random forest classifier model
rando_forest = RandomForestClassifier(numTrees=10)
rando_forest_model = rando_forest.fit(train)
rando_forest_preds = rando_forest_model.transform(validation)
custom_evaluation(rando_forest_preds, 'Random Forest')

# In[128]:

# Gradient-boosted trees
gbtrees = GBTClassifier(maxIter=10)
gbtree_model = gbtrees.fit(train)
gbtree_preds = gbtree_model.transform(validation)
custom_evaluation(gbtree_preds, 'Gradient Boosted Trees')

# In[129]:

#SVM
def main(base_path):
    """Train a random forest classifier on flight delay features and save it.

    Reads ``{base_path}/data/simple_flight_delay_features.jsonl.bz2``,
    bucketizes ``ArrDelay``, string-indexes the categorical columns,
    assembles one feature vector, fits a ``RandomForestClassifier`` on all
    rows, and writes the bucketizer, string indexer models, vector assembler
    and classifier under ``{base_path}/models``. Accuracy and a prediction
    sample are printed.

    Args:
        base_path: Project root directory; a falsy value means ".".
    """
    # Default to "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # Create the Spark environment, or reuse one that is already running.
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql

    sc = pyspark.SparkContext.getOrCreate()
    spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),        # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),   # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),   # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),         # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),     # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),      # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),      # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),        # "DepDelay":14.0
        StructField("Dest", StringType(), True),            # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),        # "Distance":368.0
        StructField("FlightDate", DateType(), True),        # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),       # "FlightNum":"6109"
        StructField("Origin", StringType(), True),          # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    # The result is unused; presumably this forces an early read so a bad
    # input path fails here — TODO confirm.
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column, features.where(features[column].isNull()).count())
        for column in features.columns
    ]
    cols_with_nulls = [entry for entry in null_counts if entry[1] > 0]
    print(cols_with_nulls)

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat

    features_with_route = features.withColumn(
        'Route',
        concat(
            features.Origin,
            lit('-'),
            features.Dest
        )
    )
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay; the five split
    # points below define four buckets (0, 1, 2, 3)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature extraction tools in pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the fitted indexer model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path,
            column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance",
        "DayOfMonth", "DayOfWeek",
        "DayOfYear"]
    index_columns = ["Carrier_index", "Origin_index",
                     "Dest_index", "Route_index"]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier

    # NOTE(review): maxBins=4657 is presumably sized to the largest
    # categorical index cardinality — confirm against the data.
    rfc = RandomForestClassifier(
        featuresCol="Features_vec",
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        maxBins=4657,
        maxMemoryInMB=1024
    )
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path
    )
    model.write().overwrite().save(model_output_path)

    # Evaluate the model. NOTE: this scores the same rows the model was
    # fitted on, so the printed accuracy is a training accuracy.
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    evaluator = MulticlassClassificationEvaluator(
        predictionCol="Prediction",
        labelCol="ArrDelayBucket",
        metricName="accuracy"
    )
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
# Remove the unwanted columns and cache the reduced frame.
df = df.drop(*drop_cols)
df.cache()

print("Creating Splits")
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(selected_cols)))
print("Selected Features: {0}".format(selected_cols))

print("Building Pipeline")
# Hash the selected columns (all treated as categorical) into one vector.
hasher = FeatureHasher(
    numFeatures=1024,
    inputCols=selected_cols,
    outputCol="features",
    categoricalCols=selected_cols,
)
forest = RandomForestClassifier(
    featuresCol="features",
    labelCol="HasDetections",
    predictionCol="prediction",
    probabilityCol="probability",
)
pipeline = Pipeline(stages=[hasher, forest])
evaluator = MulticlassClassificationEvaluator(
    labelCol="HasDetections",
    predictionCol="prediction",
    metricName="accuracy",
)

print("Configuring Validation")
# Single-point grid: one value per tuned parameter.
params = (
    ParamGridBuilder()
    .addGrid(hasher.numFeatures, [1024])
    .addGrid(forest.maxDepth, [30])
    .addGrid(forest.maxBins, [64])
    .addGrid(forest.numTrees, [100])
    .build()
)
["features", "label"]) labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(trainData) trainData = labelIndexer.transform(trainData) label = labelIndexer.labels labelDict = {} for i in range(len(label)): labelDict[label[i]] = i labelValIndex = list(labelDict.items()) labelRdd = sc.parallelize(labelValIndex) labelDF = spark.createDataFrame(labelRdd, ['secID', 'index']) labelDF.write.save( 'hdfs://master:9000//fcd/completeLabelIndexer/labelIndexer_{}'. format(index), format='parquet', mode='append') # df = spark.read.format('parquet').load('hdfs://master:9000//sparkExperiment/labelIndexer/labelIndexer_60438') rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol='indexedLabel', featuresCol='features', seed=42) model1 = rf.fit(trainData) model1.save( 'hdfs://master:9000//fcd/completeModel/model_{}'.format(index)) end = time.time() print('训练花费时间: {}s'.format(end - start)) sc.stop()
# Features with more than 4 distinct values are treated as continuous
# (maxCategories=4).
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

# 70/30 train/test split with a fixed seed.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=13795)
trainingData.persist()
testData.persist()

print("Number of training set rows: %d" % trainingData.count())
print("Number of test set rows: %d" % testData.count())

# Random forest estimator.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
)

# Map indexed predictions back to the original labels.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Chain the indexers, the forest and the label converter in one pipeline.
pipeline = Pipeline(
    stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Fit the whole pipeline; this also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
# Index the churn label and the international-plan flag.
label_indexer = StringIndexer(inputCol='churned', outputCol='label')
plan_indexer = StringIndexer(inputCol='intl_plan',
                             outputCol='intl_plan_indexed')

# Feature vector: the indexed plan flag plus the reduced numeric columns.
input_cols = ['intl_plan_indexed'] + reduced_numeric_cols
assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

# Hyperparameters are read from the command line: numTrees, maxDepth, impurity.
param_numTrees = int(sys.argv[1])
param_maxDepth = int(sys.argv[2])
param_impurity = sys.argv[3]

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    numTrees=param_numTrees,
    maxDepth=param_maxDepth,
    impurity=param_impurity,
)

pipeline = Pipeline(
    stages=[plan_indexer, label_indexer, assembler, classifier])

# 70/30 train/test split, then fit the pipeline on the training part.
(train, test) = churn_data.randomSplit([0.7, 0.3])
model = pipeline.fit(train)

# Record the hyperparameters of this run.
cdsw.track_metric("numTrees", param_numTrees)
cdsw.track_metric("maxDepth", param_maxDepth)
cdsw.track_metric("impurity", param_impurity)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import udf

predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator()