featuresCol = 'features')

# 5-fold cross-validation of a default DecisionTreeClassifier over `paramGrid`.
dtClassifier_cv = CrossValidator(
    estimator=DecisionTreeClassifier(),
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=5,
)

# Fit on the training split, then predict on the held-out split.
dt_model = dtClassifier_cv.fit(trainingData)
predictions = dt_model.transform(testData)
predictions.select('prediction', 'indexed', 'label', 'features').show()

# Score the held-out predictions with the evaluator defined earlier.
print('Results of Decision Tree: {}'.format(evaluator.evaluate(predictions)))

# Keep the training split cached: cross-validation below re-reads it for
# every fold / parameter combination.
trainingData.persist()

# Random forest trained on the 'indexed' label column.
rfClassifier = RandomForestClassifier(labelCol='indexed',
                                      featuresCol='features')

# 4 depths x 4 minimum leaf sizes = 16 candidate settings.
# The grid is built from the params of the estimator instance that is
# actually cross-validated (the documented ParamGridBuilder usage); the
# original built it from class-level params and preceded it with an
# addGrid() call that lacked the values argument and raised TypeError.
param_rf = (ParamGridBuilder()
            .addGrid(rfClassifier.maxDepth, [3, 4, 5, 6])
            .addGrid(rfClassifier.minInstancesPerNode, [3, 5, 7, 9])
            .build())

# Cross-validate the configured estimator; the original passed a fresh
# RandomForestClassifier(), silently dropping labelCol='indexed'.
# NOTE(review): this evaluator uses its default label column ('label'),
# while the classifier is trained on 'indexed' -- confirm that is intended.
rfClassifier_cv = CrossValidator(estimator=rfClassifier,
                                 estimatorParamMaps=param_rf,
                                 evaluator=MulticlassClassificationEvaluator(),
                                 numFolds=5)
rf_model = rfClassifier_cv.fit(trainingData)
prediction_rf = rf_model.transform(testData)
prediction_rf.select('prediction', 'indexed', 'label', 'features').show()

# The original referenced the undefined name `predictions_rf` here.
print('Results of Random Forest: {}'.format(evaluator.evaluate(prediction_rf)))

#######
#Gradient boosting trees classifier:
from pyspark.ml.classification import GBTClassifier  
# Example no. 2
# 0
    StructField("\"\"\"\"residual sugar\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"chlorides\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"free sulfur dioxide\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"total sulfur dioxide\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"density\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"pH\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"sulphates\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"alcohol\"\"\"\"", FloatType(), True),
    StructField("\"\"\"\"quality\"\"\"\"", FloatType(), True)
])
# Read the ';'-delimited training CSV (with header row) using the explicit
# schema defined above.
training = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ";")
    .schema(schema)
    .load("s3n://643-pa2/TrainingDataset.csv")
)

# Pack the eleven input columns into a single 'features' vector column.
vectorAssembler = VectorAssembler(
    inputCols=[
        "\"\"\"\"\"fixed acidity\"\"\"\"",
        "\"\"\"\"volatile acidity\"\"\"\"",
        "\"\"\"\"citric acid\"\"\"\"",
        "\"\"\"\"residual sugar\"\"\"\"",
        "\"\"\"\"chlorides\"\"\"\"",
        "\"\"\"\"free sulfur dioxide\"\"\"\"",
        "\"\"\"\"total sulfur dioxide\"\"\"\"",
        "\"\"\"\"density\"\"\"\"",
        "\"\"\"\"pH\"\"\"\"",
        "\"\"\"\"sulphates\"\"\"\"",
        "\"\"\"\"alcohol\"\"\"\"",
    ],
    outputCol='features',
)

# Keep only the feature vector and the quality label, and preview 3 rows.
training_data = vectorAssembler.transform(training).select(
    ['features', "\"\"\"\"quality\"\"\"\""])
training_data.show(3)

# Depth-10 random forest with the quality column as the label.
rf = RandomForestClassifier(
    labelCol="\"\"\"\"quality\"\"\"\"",
    featuresCol='features',
    maxDepth=10,
)
#lr = LinearRegression(featuresCol = 'features', labelCol="\"\"\"\"quality\"\"\"\"", maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model and store it next to the training data on S3.
model = rf.fit(training_data)
model.save("s3n://643-pa2/TrainingModel.model")
print("\n\nPrinting Training Schema with Features Table\n\n")
dataDF.printSchema()

# Randomly split the data: 70% for training, the remainder for testing.

splitValue = 0.7
trainingDF, testDF = defTrain.randomSplit([splitValue, 1 - splitValue])
print("\nSplitted Data into Training and Testing Dataset\n")

# Random forest classifier fitted on the training split.
# NOTE(review): the estimator is a classifier but it is scored with a
# RegressionEvaluator (RMSE over predicted vs. actual quality) -- confirm
# this is the intended metric.

rf = RandomForestClassifier(featuresCol='features',
                            labelCol='""""quality"""""',
                            numTrees=100,
                            maxBins=484,
                            maxDepth=25,
                            minInstancesPerNode=5,
                            seed=34)
rfPipeline = Pipeline(stages=[assembler, rf])
rfPipelineModel = rfPipeline.fit(trainingDF)
evaluator = RegressionEvaluator(labelCol='""""quality"""""',
                                predictionCol="prediction",
                                metricName="rmse")
# Score the rows the model was fitted on. The original transformed
# `defTrain` (the full, unsplit dataset), so the "training" RMSE also
# included every test row.
rfTrainingPredictions = rfPipelineModel.transform(trainingDF)
rfTestPredictions = rfPipelineModel.transform(testDF)

print(
    "\nCompleted Model Training...\n\nRandom Forest RMSE on traning data = %g\n"
    % evaluator.evaluate(rfTrainingPredictions))
print("\nRandom Forest RMSE on test data = %g\n" %
    rfreg.maxBins, [32, 100, 200]).build()

# Evaluator for continuous outcomes: mean squared error on 'prediction'.
cont_eval = RegressionEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='mse',
)

### For categorical outcomes
# Decision tree and the maxBins values to search over.
dt = DecisionTreeClassifier(labelCol='label', featuresCol='features')

dt_pgrid = (
    ParamGridBuilder()
    .addGrid(dt.maxBins, [32, 80])
    .build()
)

# Random forest (50 trees) with a maxDepth x maxBins grid.
rf = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    numTrees=50,
)

rf_pgrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [3, 7])
    .addGrid(rf.maxBins, [32, 80])
    .build()
)

# Naive Bayes with a grid over the smoothing parameter.
nb = NaiveBayes(labelCol='label', featuresCol='features')

nb_pgrid = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0, 0.3, 0.8])
    .build()
)

# Evaluator for categorical outcomes: area under the ROC curve.
cat_eval = BinaryClassificationEvaluator(
    labelCol="label",
    metricName='areaUnderROC',
)
# Example no. 5
# 0
# Assemble the feature columns into one 'features' vector, then drop the
# individual columns and cache the result.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
final_df = assembler.transform(nwdf_no_strings)
final_final_df = final_df.drop(*feature_columns).cache()

# Index the 'label' column into 'indexed' (the original author noted that
# indexing may not be required for this data).
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(final_final_df)
td = si_model.transform(final_final_df)

# Evaluators
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
eval = BinaryClassificationEvaluator()

# RandomForest classifier, trained on the indexed label.
rf = RandomForestClassifier(numTrees=100,
                            maxDepth=16,
                            labelCol="indexed",
                            seed=42)
model = rf.fit(td)
# Predictions are in the 'indexed' label space, so score them against
# 'indexed'. The original compared them with the raw 'label' column on a
# frame without 'indexed', which is only correct when the StringIndexer
# mapping happens to coincide with the raw label values.
result = model.transform(td)
print('Accuracy on training data: ',
      evaluator.evaluate(result, {evaluator.labelCol: "indexed"}))

# Train test split for model evaluation
# NOTE(review): the split is taken from the frame without the 'indexed'
# column; confirm the estimators fitted on it later do not need that column.
train, test = final_final_df.randomSplit([0.7, 0.3], seed=12345)
train.cache()
test.cache()

# ---------------
# Random Forest:
# ---------------

rf = RandomForestClassifier(numTrees=100,
    def featuresSelection(self, dataset_add, feature_colm, label_colm,
                          relation_list, relation, userId, algoName):
        """Compute random-forest feature importances plus a statistical test.

        The parquet dataset at ``dataset_add`` is loaded, optionally passed
        through ``DataTransformation.colmTransformation`` (only when
        ``relation == "non_linear"``), transformed with ``dataTranform`` and
        summarised with ``dataStatistics``. A 10-tree random forest is then
        fitted on an 80% split and its feature importances are reported.

        Args:
            dataset_add: Path handed to ``spark.read.parquet``.
            feature_colm: Feature columns, forwarded to ``dataTranform``.
            label_colm: Label column, forwarded to ``dataTranform``.
            relation_list: Forwarded to ``colmTransformation`` for the
                non-linear case.
            relation: ``"non_linear"`` enables the column transformation.
            userId: Not used by this method.
            algoName: ``"random_regressor"`` (Pearson test +
                RandomForestRegressor) or ``"random_classifier"``
                (chi-square test + RandomForestClassifier).

        Returns:
            A dict with the keys ``'feature_importance'``, ``'summaryDict'``,
            ``'categoricalSummary'`` and either ``'pearson_test_data'`` or
            ``'ChiSquareTestData'``. If any step raises, the error message is
            printed and ``None`` is returned.
        """
        # The original looked up `round` on `builtins` explicitly, presumably
        # because the name is shadowed elsewhere in the module (e.g. by a
        # pyspark.sql.functions import) -- TODO confirm. Keep doing so.
        import builtins

        try:
            dataset = spark.read.parquet(dataset_add)

            # Change the relationship of the columns when requested.
            if relation == "non_linear":
                dataset = DataTransformation(
                    dataset=dataset).colmTransformation(
                        colmTransformationList=relation_list)

            # Transformation
            dataTransformationResult = DataTransformation(
                dataset=dataset).dataTranform(labelColm=label_colm,
                                              featuresColm=feature_colm)
            dataset = dataTransformationResult["dataset"]
            categoricalFeatures = dataTransformationResult[
                "categoricalFeatures"]
            numericalFeatures = dataTransformationResult["numericalFeatures"]
            maxCategories = dataTransformationResult["maxCategories"]
            categoryColmStats = dataTransformationResult["categoryColmStats"]
            indexedFeatures = dataTransformationResult["indexedFeatures"]
            label = dataTransformationResult["label"]

            # Statistics
            summaryDict = DataTransformation(dataset=dataset).dataStatistics(
                categoricalFeatures=categoricalFeatures,
                numericalFeatures=numericalFeatures)

            # Only the 80% training part is used; the 20% remainder was
            # never read by the original either.
            trainData, _ = dataset.randomSplit([0.80, 0.20], seed=40)

            if algoName == "random_regressor":
                statisticalTestResult = StatisticalTest(
                    dataset=dataset,
                    features=numericalFeatures,
                    labelColm=label).pearsonTest()
                randomForestModel = RandomForestRegressor(
                    labelCol=label,
                    featuresCol='vec_indexed_features',
                    numTrees=10,
                    maxBins=maxCategories)
                keyStatsTest = "pearson_test_data"
            elif algoName == "random_classifier":
                statisticalTestResult = StatisticalTest(
                    dataset=dataset,
                    features=indexedFeatures,
                    labelColm=label).chiSquareTest(
                        categoricalFeatures=categoricalFeatures,
                        maxCategories=maxCategories)
                randomForestModel = RandomForestClassifier(
                    labelCol=label,
                    featuresCol='vec_indexed_features',
                    numTrees=10,
                    maxBins=maxCategories)
                keyStatsTest = "ChiSquareTestData"
            else:
                # Previously this surfaced as an unrelated "referenced before
                # assignment" error; report the real cause instead. It is
                # still handled by the except clause below.
                raise ValueError(
                    "unsupported algoName: {!r}".format(algoName))

            randomForestModelFit = randomForestModel.fit(trainData)
            print(randomForestModelFit.featureImportances)

            feature_importance = (
                randomForestModelFit.featureImportances.toArray().tolist())
            print(feature_importance)
            # Round every importance to 4 decimals for the response.
            featureImportance = [
                builtins.round(value, 4) for value in feature_importance
            ]
            print(featureImportance)

            features_column_for_user = numericalFeatures + categoricalFeatures
            feature_imp = {
                'feature_importance': featureImportance,
                "feature_column": features_column_for_user
            }

            return {
                'feature_importance': feature_imp,
                keyStatsTest: statisticalTestResult,
                'summaryDict': summaryDict,
                'categoricalSummary': categoryColmStats
            }

        except Exception as e:
            # Best-effort contract kept from the original: report and
            # return None rather than propagate.
            print(str(e))
            return None
# Example no. 7
# 0
# COMMAND ----------

# Sweep every (numTrees, maxDepth) combination, training and logging one
# random-forest run per pair via classificationModel().
numTreesList = [10, 25, 50]
maxDepthList = [3, 10, 5]
for numTrees in numTreesList:
    for maxDepth in maxDepthList:
        # Parameters recorded for this run; the two shared dicts are
        # merged on top of the model-specific entries.
        params = dict(numTrees=numTrees,
                      maxDepth=maxDepth,
                      model="RandomForest")
        params.update(dg_noise)
        params.update(model_data_date)

        rf = RandomForestClassifier(
            labelCol="indexedLabel",
            featuresCol="features",
            numTrees=numTrees,
            maxDepth=maxDepth,
        )
        stages = [labelIndexer, assembler, scaler, rf, labelConverter]
        model, predictions, accuracy, ml_run_info = classificationModel(
            stages, params, train_data, test_data)
        print("Trees: %s, Depth: %s, Accuracy: %s\n" %
              (numTrees, maxDepth, accuracy))

# COMMAND ----------

# MAGIC %md
# MAGIC ### Get Best Run and Metric from MLflow

# COMMAND ----------

# Experiment id taken from the run info of the last run of the sweep.
mlflow_experiment_id = ml_run_info.experiment_id
# Example no. 8
# 0
def main(base_path):
  """Train, evaluate and persist the baseline flight-delay classifier.

  Loads ``{base_path}/data/simple_flight_delay_features.json``, adds a
  ``Route`` column, bucketizes ``ArrDelay`` into ``ArrDelayBucket``,
  string-indexes the categorical columns and assembles ``Features_vec``.
  A RandomForestClassifier is then fitted on three random 80/20 splits and
  scored on four metrics. The fitted transformers and the model are written
  under ``{base_path}/models``, and the per-metric averages are appended to
  ``{base_path}/models/score_log.pickle``.

  Args:
    base_path: Directory that contains the ``data/`` and ``models/`` folders.
  """
  APP_NAME = "train_spark_mllib_model.py"

  # Make pyspark importable, falling back to findspark when it is not on
  # the path yet.
  try:
    import pyspark.sql
  except ImportError:
    import findspark
    findspark.init()
    import pyspark.sql

  # Reuse the active SparkSession when there is one, otherwise create it.
  # The original guarded on `sc and spark`, but both names are assigned in
  # this function and are therefore locals, so the guard always failed and
  # a second SparkContext was always constructed.
  spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()

  from collections import defaultdict
  import pickle

  import numpy as np
  from pyspark.ml.classification import RandomForestClassifier
  from pyspark.ml.evaluation import MulticlassClassificationEvaluator
  from pyspark.ml.feature import Bucketizer, StringIndexer, VectorAssembler
  from pyspark.sql.functions import lit, concat
  from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField

  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),     # "ArrDelay":5.0
    StructField("CRSArrTime", TimestampType(), True),    # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
    StructField("CRSDepTime", TimestampType(), True),    # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
    StructField("Carrier", StringType(), True),     # "Carrier":"WN"
    StructField("DayOfMonth", IntegerType(), True), # "DayOfMonth":31
    StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
    StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
    StructField("DepDelay", DoubleType(), True),     # "DepDelay":14.0
    StructField("Dest", StringType(), True),        # "Dest":"SAN"
    StructField("Distance", DoubleType(), True),     # "Distance":368.0
    StructField("FlightDate", DateType(), True),    # "FlightDate":"2015-12-30T16:00:00.000-08:00"
    StructField("FlightNum", StringType(), True),   # "FlightNum":"6109"
    StructField("Origin", StringType(), True),      # "Origin":"TUS"
  ])

  input_path = "{}/data/simple_flight_delay_features.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()

  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features.where(features[column].isNull()).count()) for column in features.columns]
  cols_with_nulls = [entry for entry in null_counts if entry[1] > 0]
  print(cols_with_nulls)

  #
  # Add a Route variable to replace FlightNum
  #
  features_with_route = features.withColumn(
    'Route',
    concat(
      features.Origin,
      lit('-'),
      features.Dest
    )
  )
  features_with_route.show(6)

  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay. The five split
  # points below define four buckets (0-3): below -15, -15 to 0, 0 to 30,
  # and 30 or more.
  #
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )

  # Save the bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

  # Apply the bucketizer
  ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

  #
  # Turn category fields into indexes with pyspark.ml.feature.StringIndexer
  #
  categorical_columns = ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                         "Origin", "Dest", "Route"]
  for column in categorical_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )

    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

    # Drop the original column
    ml_bucketized_features = ml_bucketized_features.drop(column)

    # Save the fitted indexer for this column
    string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)

  # Combine the numeric fields and the index fields into one feature vector.
  # The index columns are derived from the indexed columns above; the
  # original hand-written list contained "Origin_index" twice, which fed
  # the same feature into the vector two times.
  numeric_columns = ["DepDelay", "Distance"]
  index_columns = [column + "_index" for column in categorical_columns]
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

  # Save the vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)

  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)

  # Inspect the finalized features
  final_vectorized_features.show()

  #
  # Train and evaluate the classifier on `split_count` random test/train
  # splits, collecting four metrics per split
  #
  scores = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3

  for i in range(1, split_count + 1):
    print("Run {} out of {} of test/train splits in cross validation...".format(
        i,
        split_count,
      )
    )

    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

    # Instantiate and fit random forest classifier on the training split
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4657,
    )
    model = rfc.fit(training_data)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)

    # Evaluate model using test data
    predictions = model.transform(test_data)

    # Evaluate this split's results for each metric
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)

      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))

  #
  # Evaluate average and STD of each metric (the original ran this section
  # twice; only the pass that feeds the score log is kept)
  #
  score_averages = defaultdict(float)

  for metric_name in metric_names:
    metric_scores = scores[metric_name]

    average_accuracy = sum(metric_scores) / len(metric_scores)
    print("AVG {} = {:.4f}".format(metric_name, average_accuracy))
    score_averages[metric_name] = average_accuracy

    std_accuracy = np.std(metric_scores)
    print("STD {} = {:.4f}".format(metric_name, std_accuracy))

  #
  # Persist the score to a score log that exists between runs
  #
  score_log_filename = "{}/models/score_log.pickle".format(base_path)

  # Load the score log or initialize an empty one. The file is written by
  # this script itself; do not point it at untrusted pickle data.
  try:
    with open(score_log_filename, "rb") as score_log_file:
      score_log = pickle.load(score_log_file)
    if not isinstance(score_log, list):
      score_log = []
  except (IOError, EOFError, pickle.UnpicklingError):
    # Missing, empty or unreadable log: start over with an empty one.
    score_log = []

  # Compute the score log entry for this run
  score_log_entry = {metric_name: score_averages[metric_name] for metric_name in metric_names}

  # Compute and display the change in score for each metric; on the first
  # run there is no previous entry, so the deltas are zero
  last_log = score_log[-1] if score_log else score_log_entry

  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    print("{} delta: {:.4f}".format(metric_name, run_delta))

  # Append this run's average scores to the log
  score_log.append(score_log_entry)

  # Persist the log for next run; the context manager guarantees the file
  # is flushed and closed
  with open(score_log_filename, "wb") as score_log_file:
    pickle.dump(score_log, score_log_file)
# Example no. 9
# 0
    "fixedacidity", "volatileacidity", "citricacid", "residualsugar",
    "chlorides", "freesulfurdioxide", "totalsulfurdioxide", "density", "ph",
    "sulphates", "alcohol"
],
                                 outputCol='features')

# ## Step 3 : Prepare Classifier ( Random Forest in this case )
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# ## Grid Search - Spark ML way
# ### using Grid Search and cross validation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
RFclassifier = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    impurity=param_impurity,
)

# Indexers first, classifier last.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, RFclassifier])

# ### Define the test configurations to be evaluated in the grid
paramGrid = (
    ParamGridBuilder()
    .addGrid(RFclassifier.maxDepth, param_maxDepth)
    .addGrid(RFclassifier.numTrees, param_numTrees)
    .build()
)

# ### Define the metric by which the model will be evaluated
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')

crossval = CrossValidator(
    estimator=pipeline,
# Example no. 10
# 0
# Naive Bayes on the TF-IDF features.
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData2)
# The evaluator's default metric is F1, not accuracy; request accuracy
# explicitly so the value matches the label printed below.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy")

predictions = model.transform(testData2)
result4 = evaluator.evaluate(predictions)
print('naive bayes accuracy using TF-IDF features is : ' + str(result4))
#################################################################################################

## random forest using count vectors features
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=100,
                            maxDepth=4,
                            maxBins=32)

# Train model with Training Data
rfModel = rf.fit(trainingData1)
# Accuracy again, for the same reason as above.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy")

predictions = rfModel.transform(testData1)
result5 = evaluator.evaluate(predictions)
print('random forest accuracy using count vectors features is: ' +
      str(result5))

## random forest using TF-IDF features
from pyspark.ml.classification import RandomForestClassifier
# Example no. 11
# 0
# COMMAND ----------

# Creating features lists to send to our indexers and vector assemblers
# Then creating a random forest classifier and converting our predictions back to labels
# Categorical columns that get a "<name>_index" counterpart.
features = ['fire_class', 'discovery_month', 'discovery_year']
# Columns assembled into the model's feature vector.
features2 = [
    'fire_size', 'latitude', 'longitude', 'vegetation', 'fire_magnitude',
    'temp_discovery', 'wind_discovery', 'humid_discovery',
    'precip_discovery', 'remoteness', 'tempBucket', 'windBucket',
    'humidBucket', 'precipBucket', 'fire_class_index',
    'discovery_month_index', 'discovery_year_index',
]

# Indexers are fitted on the training data up front.
labelIndexer = StringIndexer(inputCol='fire_cause',
                             outputCol='label').fit(train)
featureIndexer = [
    StringIndexer(inputCol=name, outputCol=name + "_index").fit(train)
    for name in features
]

assembler = VectorAssembler(inputCols=features2, outputCol="features")

rf = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    impurity='gini',
    maxDepth=10,
    numTrees=35,
    featureSubsetStrategy='auto',
)

# Map numeric predictions back to the original label strings.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Pipeline stages: feature indexers, label indexer, bucketizers,
# assembler, classifier, label converter.
stag2 = [
    *featureIndexer,
    labelIndexer,
    tempBucketizer,
    windBucketizer,
    humidBucketizer,
    precipBucketizer,
    assembler,
    rf,
    labelConverter,
]

# COMMAND ----------

pl2 = Pipeline(stages=stag2)

# Fit the pipeline on the training data
plTraining2 = pl2.fit(train)

# Apply the fitted pipeline to the test data
predTest2 = plTraining2.transform(test)
# Example no. 12
# 0
def randomForest(trainingData,
                 testData,
                 impurity,
                 maxDepth,
                 maxBins,
                 numTrees,
                 enableCrossValidator=False,
                 featuresCol='features',
                 labelCol='label',
                 predictionCol='prediction',
                 probabilityCol='probability',
                 rawPredictionCol='rawPrediction',
                 minInstancesPerNode=1,
                 minInfoGain=0.0,
                 maxMemoryInMB=256,
                 cacheNodeIds=False,
                 checkpointInterval=10,
                 featureSubsetStrategy='auto',
                 seed=None,
                 subsamplingRate=1.0):
    """Train a RandomForestClassifier and predict the test records.

    Both inputs are RDDs of indexable records. Elements 1..28 of each record
    are used as the dense feature vector, element 30 as the label and
    element 31 as the record index.

    Args:
        trainingData: RDD of training records.
        testData: RDD of test records.
        impurity, maxDepth, maxBins, numTrees: Forwarded to
            RandomForestClassifier.
        enableCrossValidator: When true, the classifier is wrapped in a
            5-fold CrossValidator (empty parameter grid) scored with a
            BinaryClassificationEvaluator.
        featuresCol, labelCol: Names given to the feature and label columns
            of the DataFrames built here, and forwarded to the classifier.
        Remaining keyword arguments are forwarded to RandomForestClassifier
        unchanged.

    Returns:
        An RDD of ``(prediction, label, index)`` tuples for the test data.
    """
    print("\nInizio classificazione con RandomForestClassifier")

    # Initialise the classifier with the given (and default) parameters.
    rfc = RandomForestClassifier(featuresCol=featuresCol,
                                 labelCol=labelCol,
                                 predictionCol=predictionCol,
                                 probabilityCol=probabilityCol,
                                 rawPredictionCol=rawPredictionCol,
                                 maxDepth=maxDepth,
                                 maxBins=maxBins,
                                 minInstancesPerNode=minInstancesPerNode,
                                 minInfoGain=minInfoGain,
                                 maxMemoryInMB=maxMemoryInMB,
                                 cacheNodeIds=cacheNodeIds,
                                 checkpointInterval=checkpointInterval,
                                 impurity=impurity,
                                 numTrees=numTrees,
                                 featureSubsetStrategy=featureSubsetStrategy,
                                 seed=seed,
                                 subsamplingRate=subsamplingRate)

    print("    -modello creato")

    if enableCrossValidator:
        # The evaluator must read the same columns the classifier writes;
        # the original always used the evaluator's default column names.
        evaluator = BinaryClassificationEvaluator(
            labelCol=labelCol, rawPredictionCol=rawPredictionCol)

        # k-fold cross validation: `estimator` is the classifier being
        # evaluated and `numFolds` is k. The parameter grid is empty.
        validator = CrossValidator(estimator=rfc,
                                   estimatorParamMaps=ParamGridBuilder().build(),
                                   evaluator=evaluator,
                                   numFolds=5)
    else:
        validator = rfc

    print("    -validator creato")

    # Name the columns after featuresCol/labelCol so non-default names work;
    # the original hard-coded 'features' and 'label' here, so any other
    # value made the fit fail on a missing column.
    training = trainingData.map(lambda x: (x[31], Vectors.dense(x[1:29]), x[
        30])).toDF(schema=['index', featuresCol, labelCol]).orderBy('index')

    pipeline = Pipeline(stages=[validator])

    model = pipeline.fit(training)

    print("    -modello addestrato con la pipeline (" + str(training.count()) +
          " elementi utilizzati come training)")

    test = testData.map(lambda x: (x[30], Vectors.dense(x[1:29]), x[31])).toDF(
        schema=[labelCol, featuresCol, 'index']).orderBy('index')

    # (prediction, label, index), read by column name instead of by the
    # positional indexes (5, 0, 2) the original relied on.
    predictionsAndLabels = model.transform(test).rdd.map(
        lambda row: (row[predictionCol], row[labelCol], row['index']))

    print("    -" + str(predictionsAndLabels.count()) +
          " elementi predetti (" + str(test.count()) +
          " elementi usati come test)")

    return predictionsAndLabels
def main(base_path):
    """Train, evaluate and persist a random forest flight-delay classifier.

    Reads ``{base_path}/data/simple_flight_delay_features.json``, adds a
    ``Route`` column, buckets ``ArrDelay`` into ``ArrDelayBucket``,
    string-indexes the categorical columns and assembles ``Features_vec``.
    A ``RandomForestClassifier`` is then trained and scored on
    ``split_count`` independent random 80/20 train/test splits, the
    per-metric mean and standard deviation are printed, and the averages are
    appended to a pickled score log so consecutive runs can be compared.

    Every fitted transformer and the model of the last split are written
    under ``{base_path}/models``.

    Args:
        base_path: Project root; ``data/`` is read from it and ``models/``
            is written to it.
    """
    APP_NAME = "train_spark_mllib_model.py"

    # Set up Spark. The session used to be created only after probing
    # ``sc and spark``, but both names were locals of this function, so the
    # probe always failed and a brand-new SparkContext was always requested
    # (which raises when one is already active). ``getOrCreate`` reuses an
    # active context/session instead.
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql

    spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()

    #
    # Example input record:
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.json".format(base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check the features for null values before using Spark ML
    # (one count job per column; only columns with nulls are printed).
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = [pair for pair in null_counts if pair[1] > 0]
    print(cols_with_nulls)

    #
    # Add a Route variable ("<Origin>-<Dest>") to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucket ArrDelay. Five split points
    # give four buckets (0-3): (-inf, -15), [-15, 0), [0, 30), [30, inf).
    #
    from pyspark.ml.feature import Bucketizer

    # Configure the bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn each categorical field into a "<column>_index" column
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the fitted indexer
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine the continuous numeric fields with the categorical indexes into
    # a single feature vector. Each index column is listed exactly once; the
    # list previously repeated "Origin_index", which fed that feature to the
    # model twice.
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns; they now live inside Features_vec
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Train and evaluate the classifier on `split_count` random train/test
    # splits, scoring each split on four metrics.
    #
    from collections import defaultdict
    from pyspark.ml.classification import RandomForestClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    scores = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
        base_path)

    for i in range(1, split_count + 1):
        print("Run {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Unseeded 80/20 split, so every run sees different data
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit the random forest on this split's training set.
        # NOTE(review): maxBins presumably has to cover the cardinality of the
        # largest indexed categorical feature -- confirm 4657 against the data.
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4657,
        )
        model = rfc.fit(training_data)

        # Overwrite the previously saved model; only the last split's model
        # remains on disk after the loop.
        model.write().overwrite().save(model_output_path)

        # Score the held-out test data
        predictions = model.transform(test_data)

        # Evaluate this split on every metric
        for metric_name in metric_names:

            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

    #
    # Report the mean and standard deviation of each metric across splits
    # (this section used to be duplicated with a different print precision).
    #
    import numpy as np
    score_averages = defaultdict(float)

    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        print("AVG {} = {:.4f}".format(metric_name, average_accuracy))
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)
        print("STD {} = {:.4f}".format(metric_name, std_accuracy))

    #
    # Persist the scores in a log that survives between runs
    #
    import pickle

    score_log_filename = "{}/models/score_log.pickle".format(base_path)

    # Load the score log, or start an empty one when the file is missing,
    # empty, or does not hold a list. The file is produced by this script;
    # never point this at untrusted data, since unpickling executes code.
    try:
        with open(score_log_filename, "rb") as score_log_file:
            score_log = pickle.load(score_log_file)
    except (IOError, EOFError):
        score_log = []
    if not isinstance(score_log, list):
        score_log = []

    # Entry for the current run
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compare against the previous run; on the first run compare against
    # ourselves so every delta is zero.
    last_log = score_log[-1] if score_log else score_log_entry

    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        print("{} delta: {:.4f}".format(metric_name, run_delta))

    # Append the current averages to the log
    score_log.append(score_log_entry)

    # Save the log for the next run
    with open(score_log_filename, "wb") as score_log_file:
        pickle.dump(score_log, score_log_file)
Ejemplo n.º 14
0
# Multilayer perceptron configured through constructor keywords; the block
# size is the number of training rows.
nn = MultilayerPerceptronClassifier(layers=layers,
                                    labelCol='speciesIndex',
                                    featuresCol='features',
                                    blockSize=training_data.count(),
                                    seed=1234)
model = nn.fit(training_data)
classifications = model.transform(test_data)
accuracy = evaluator.evaluate(classifications)
print("Accuracy: %s" % accuracy)

# ### Random Forest

# In[19]:

from pyspark.ml.classification import RandomForestClassifier

# In[20]:

# 40-tree forest on the same label/feature columns as the perceptron.
rf = RandomForestClassifier(labelCol='speciesIndex',
                            featuresCol='features',
                            numTrees=40)

# In[21]:

model = rf.fit(training_data)

# In[22]:

# Score the forest with the same evaluator used above.
classifications = model.transform(test_data)
accuracy = evaluator.evaluate(classifications)
print("Accuracy: %s" % accuracy)

# In[ ]:
Ejemplo n.º 15
0
# Show actual vs. predicted labels for the current model's test predictions.
predict_test.select("survived", "prediction").show()

# Confusion counts, one Spark count job each.
# NOTE(review): the names below treat class 0 as the "positive" class
# (tp = actual 0 / predicted 0, fp = actual 1 / predicted 0) -- confirm that
# this is the intended convention.
tp = predict_test.filter((col("survived") == 0)
                         & (col("prediction") == 0)).count()
tn = predict_test.filter((col("survived") == 1)
                         & (col("prediction") == 1)).count()
fp = predict_test.filter((col("survived") == 1)
                         & (col("prediction") == 0)).count()
fn = predict_test.filter((col("survived") == 0)
                         & (col("prediction") == 1)).count()
print(tp, tn, fp, fn)

# Accuracy = correctly classified rows / all rows.
print("acc=", (tp + tn) / (tp + tn + fp + fn))

# Repeat the same evaluation with a random forest. The `dt`/`dtmodel` names
# are reused even though the estimator is a RandomForestClassifier.
# NOTE(review): labelCol is spelled "Survived" here but "survived" in the
# selects/filters; presumably this relies on case-insensitive column
# resolution -- confirm.
from pyspark.ml.classification import RandomForestClassifier
dt = RandomForestClassifier(labelCol="Survived", featuresCol="features")
dtmodel = dt.fit(train)
predict_test = dtmodel.transform(test)
predict_test.select("survived", "prediction").show()

# Same confusion counts as above, now for the random forest predictions.
tp = predict_test.filter((col("survived") == 0)
                         & (col("prediction") == 0)).count()
tn = predict_test.filter((col("survived") == 1)
                         & (col("prediction") == 1)).count()
fp = predict_test.filter((col("survived") == 1)
                         & (col("prediction") == 0)).count()
fn = predict_test.filter((col("survived") == 0)
                         & (col("prediction") == 1)).count()
print(tp, tn, fp, fn)

print("acc=", (tp + tn) / (tp + tn + fp + fn))
    print("Area under PR curve: " + str(area_under_pr))
    print("F1 score = %g" % f1_score)
    print("Accuracy = %g" % accuracy)
    print(
        "########################################################################"
    )

    # Display the label and the prediction for the first 10 pairs.
    lr_result.select('label', 'pred').show(10)

    # ****************************************************************************** #
    # Run Random Forest Classification.                                              #
    # ****************************************************************************** #

    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='label',
                                predictionCol='pred',
                                rawPredictionCol='pred_raw')
    rf_model = rf.fit(trainDF)
    rf_result = rf_model.transform(testDF)

    area_under_pr = evaluator1.evaluate(rf_result)
    f1_score = evaluator2.evaluate(rf_result)
    accuracy = evaluator3.evaluate(rf_result)

    print("")
    print(
        "########################################################################"
    )
    print("RANDOM FOREST RESULTS")
    print("Area under PR curve: " + str(area_under_pr))
    print("F1 score = %g" % f1_score)
                            labelCol='label',
                            maxDepth=3)
# Fit the decision tree configured above and show the first 10 test
# predictions next to a couple of input columns.
dtModel = dt.fit(train)
predictions = dtModel.transform(test)
predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction',
                   'probability').show(10)

#22
# Area under the ROC curve for the decision tree; the metric name is passed
# as a param override at evaluate() time.
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

#23
from pyspark.ml.classification import RandomForestClassifier

# Same train/predict/inspect steps with a random forest; `predictions` is
# rebound to the forest's output.
rf = RandomForestClassifier(featuresCol='features', labelCol='label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction',
                   'probability').show(10)

#24
# Area under the ROC curve for the random forest (a fresh evaluator is
# created, identical to the one above).
evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

#31
# Print every parameter of the random forest estimator with its documentation.
print(rf.explainParams())

#27
from pyspark.ml.classification import GBTClassifier
Ejemplo n.º 18
0
# Area under the ROC curve of the cross-validated model.
# BinaryClassificationMetrics takes (score, label) pairs, so the prediction
# column has to come first; the columns were previously passed as
# (label, prediction), which scores the labels against the predictions.
print('AUC:', BinaryClassificationMetrics(predictions['prediction','label'].rdd).areaUnderROC)
bestModel = cvModel.bestModel

# List every stage of the best pipeline (works for any PipelineModel)
for stage in bestModel.stages:
  print(stage)

# NOTE(review): index 3 is presumably the classifier stage of the pipeline
# that produced cvModel -- confirm against that pipeline's stage order.
print(bestModel.stages[3].extractParamMap())

# COMMAND ----------

# Cross-validated random forest classifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
rf = ( RandomForestClassifier()
        .setFeaturesCol("features")
        .setLabelCol("label")
     )
from pyspark.ml import Pipeline

pipeline =  Pipeline().setStages([
  ipindexer, # categorize internation_plan
  labelindexer, # categorize churn
  assembler, # assemble the feature vector for all columns
  rf])
pipelineModel = pipeline.fit(trainDF)

# Number of folds for the cross-validation set up next
numFolds = 3

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
Ejemplo n.º 19
0
# The patient identifier is the third '_'-separated token of `name`.
split_col = pyspark.sql.functions.split(df['name'], '_')
df = df.withColumn('patient', split_col.getItem(2))

# Pick the training patients at random (fixed seed) so that all rows of a
# patient end up on the same side of the split.
distinct_patients = df.select('patient').distinct()
number_of_training_patients = int(distinct_patients.count() * TRAINING_RATIO)
training_patients = distinct_patients.orderBy(rand(seed=1)).limit(
    number_of_training_patients)

# Rows of selected patients form the training set; the anti-join keeps the
# rows of every other patient as the test set.
trainingData = df.join(training_patients, on=['patient'], how='inner')
testData = df.join(training_patients, on=['patient'], how='leftanti')

# Random forest estimator.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=NUM_TREES,
    maxDepth=MAX_DEPTH,
)

# Label indexing, feature preparation and the forest run as one pipeline.
pipeline = Pipeline(stages=[labelIndexer, features, rf])

# Fit on the training patients ...
model = pipeline.fit(trainingData)

# ... and predict for the held-out patients.
predictions = model.transform(testData)


def get_metrics(predictions):
    auc = BinaryClassificationEvaluator().evaluate(predictions)
Ejemplo n.º 20
0
# Row counts of the three splits. The single-argument print(...) form runs on
# both Python 2 and Python 3; the original print statements were Python 2 only.
print('Train Data Number of Row: '+ str(train.count()))
print('Validate Data Number of Row: '+ str(validate.count()))
print('Test Data Number of Row: '+ str(test.count()))

# Apply logistic regression
from pyspark.ml.classification import LogisticRegression

# regParam: regularization parameter
lr = LogisticRegression(maxIter = 100, regParam = 0.05, labelCol='index').fit(train)

# Evaluate models on area under ROC (the evaluator's default metric for
# binary classification)
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def testModel(model, validate = validate):
    """Return the BinaryClassificationEvaluator score of `model` on `validate`.

    `validate` defaults to the module-level validation DataFrame as it was
    bound when this function was defined; the label column is 'index'.
    """
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol = 'index')
    return evaluator.evaluate(pred)

from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier


dt = DecisionTreeClassifier(maxDepth = 3, labelCol ='index').fit(train)
rf = RandomForestClassifier(numTrees = 100, labelCol = 'index').fit(train)


# Fitted models keyed by display name (keys kept exactly as before).
models = {'LogisticRegression':lr,
          'DecistionTree':dt,
          'RandomForest':rf}

# Validation score per model; items() exists on Python 2 and 3, iteritems()
# only on Python 2.
modelPerf = {k:testModel(v) for k,v in models.items()}
Ejemplo n.º 21
0
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from mmlspark.train import TrainClassifier
import itertools

# Logistic regression: one estimator, and one trained model, per
# regularization strength.
lrHyperParams = [0.05, 0.2]
logisticRegressions = [
    LogisticRegression(regParam=reg_param) for reg_param in lrHyperParams
]
lrmodels = [
    TrainClassifier(model=estimator, labelCol="label").fit(ptrain)
    for estimator in logisticRegressions
]

# Random forest: every (numTrees, maxDepth) combination.
rfHyperParams = itertools.product([5, 10], [2, 3])
randomForests = [
    RandomForestClassifier(numTrees=num_trees, maxDepth=max_depth)
    for num_trees, max_depth in rfHyperParams
]
rfmodels = [
    TrainClassifier(model=estimator, labelCol="label").fit(ptrain)
    for estimator in randomForests
]

# Gradient-boosted trees: every (maxBins, maxDepth) combination.
gbtHyperParams = itertools.product([8, 16], [2, 3])
gbtclassifiers = [
    GBTClassifier(maxBins=max_bins, maxDepth=max_depth)
    for max_bins, max_depth in gbtHyperParams
]
gbtmodels = [
    TrainClassifier(model=gbt, labelCol="label").fit(ptrain)
    for gbt in gbtclassifiers
Ejemplo n.º 22
0
                                               metricName="f1")
# Confusion-matrix helper for the decision tree predictions.
# NOTE(review): MulticlassMetrics is documented as taking
# (prediction, label) pairs, but the pairs built here are
# (label, prediction); the matrix is transposed below, presumably to
# compensate -- confirm.
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Decision Tree model is %f' % evaluator1.evaluate(pred))
print('F1 score of Decision Tree model is %f' % evaluator2.evaluate(pred))
# Bare expression: its value is only displayed when run as a notebook cell.
metrics.confusionMatrix().toArray().transpose()

# <a id="context323"></a>
# #### 3.2.3. Random Forest

# In[19]:

from pyspark.ml.classification import RandomForestClassifier

# Fit a 100-tree forest on the training data (numTrees is the hyperparameter
# set here; label/feature columns are left at the estimator defaults)
rfModel = RandomForestClassifier(numTrees=100).fit(trainData)

# Make predictions on the test data
pred = rfModel.transform(testData)

pred.select('catLabel', 'label', 'prediction').show()

# Same two evaluators as for the decision tree: area under ROC and F1.
evaluator1 = BinaryClassificationEvaluator(labelCol='label',
                                           metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label',
                                               metricName="f1")
# Same (label, prediction) pair order as above; see the note there.
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Random Forest model is %f' % evaluator1.evaluate(pred))
print('F1 score of Random Forest model is %f' % evaluator2.evaluate(pred))
# Bare expression: its value is only displayed when run as a notebook cell.
metrics.confusionMatrix().toArray().transpose()
# COMMAND ----------

# MAGIC %md
# MAGIC ####Random Forest
# MAGIC
# MAGIC A Random Forest uses an ensemble of trees to improve model accuracy.
# MAGIC
# MAGIC You can read more about Random Forest from the programming guide [here](http://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests).

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

# Create an initial RandomForest estimator; every parameter other than the
# label and feature columns keeps its default.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Fit the estimator on the training data to obtain the model.
rfModel = rf.fit(trainingData)

# COMMAND ----------

# Make predictions on the test data using the model's transform() method.
predictions = rfModel.transform(testData)

# COMMAND ----------

# Show the columns of the predictions DataFrame.
predictions.printSchema()

# COMMAND ----------
Ejemplo n.º 24
0
# Index the `vehicle_color` field:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="vehicle_color", outputCol="vehicle_color_indexed")

# One-hot encode the indexed categorical `vehicle_color_indexed`:
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(inputCol="vehicle_color_indexed", outputCol="vehicle_color_encoded")

# Select the features and assemble them into a single vector column:
from pyspark.ml.feature import VectorAssembler
features = ["reviewed", "vehicle_year", "vehicle_color_encoded", "CloudCover"]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Specify the estimator (i.e., classification algorithm):
from pyspark.ml.classification import RandomForestClassifier
classifier = RandomForestClassifier(featuresCol="features", labelCol="star_rating")
print(classifier.explainParams())

# Specify the values in the hyperparameter grid
# (3 depths x 3 tree counts x 2 subsampling rates = 18 combinations):
from pyspark.ml.tuning import ParamGridBuilder
maxDepthList = [5, 10, 20]
numTreesList = [20, 50, 100]
subsamplingRateList = [0.5, 1.0]
paramGrid = ParamGridBuilder() \
  .addGrid(classifier.maxDepth, maxDepthList) \
  .addGrid(classifier.numTrees, numTreesList) \
  .addGrid(classifier.subsamplingRate, subsamplingRateList) \
  .build()

# Specify the evaluator:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
Ejemplo n.º 25
0
    print(
        f" | recall = {metrics.recall()}\n | F1-Score = {metrics.fMeasure()}")

    conf_matrix = metrics.confusionMatrix().toArray()
    sns.set(font_scale=1.4)  #for label size
    ax = sns.heatmap(conf_matrix, annot=True, annot_kws={"size": 16})
    ax.set(xlabel='Predicted Label',
           ylabel='True Label',
           title='Confusion Mtx')
    plt.show()


# In[127]:

# Random forest classifier (10 trees): fit on the training split, predict on
# the validation split and report through custom_evaluation.
rando_forest = RandomForestClassifier(numTrees=10)
rando_forest_model = rando_forest.fit(train)
rando_forest_preds = rando_forest_model.transform(validation)
custom_evaluation(rando_forest_preds, 'Random Forest')

# In[128]:

# Gradient-boosted trees (10 boosting iterations). This is gradient
# boosting, which is a different algorithm from AdaBoost.
gbtrees = GBTClassifier(maxIter=10)
gbtree_model = gbtrees.fit(train)
gbtree_preds = gbtree_model.transform(validation)
custom_evaluation(gbtree_preds, 'Gradient Boosted Trees')

# In[129]:

#SVM
def main(base_path):
  """Train a random forest flight-delay classifier on all data and save it.

  Reads ``{base_path}/data/simple_flight_delay_features.jsonl.bz2``, adds a
  ``Route`` column, buckets ``ArrDelay`` into ``ArrDelayBucket``,
  string-indexes the nominal columns, assembles ``Features_vec`` and fits a
  ``RandomForestClassifier`` on the complete data set. The bucketizer, the
  string indexers, the vector assembler and the model are written under
  ``{base_path}/models``. Accuracy is then printed for the same data the
  model was trained on, so it is not a held-out estimate.

  Args:
    base_path: Project root directory; a falsy value falls back to ".".
  """
  # Default to "." (base_path is a parameter and therefore always bound, so
  # only the falsy case needs handling).
  if not base_path:
    base_path = "."

  APP_NAME = "train_spark_mllib_model.py"

  # Set up Spark. The session used to be created only after probing
  # ``sc and spark``, but both names were locals of this function, so the
  # probe always failed and a brand-new SparkContext was always requested
  # (which raises when one is already active). ``getOrCreate`` reuses an
  # active context/session instead.
  import findspark
  findspark.init()
  import pyspark
  import pyspark.sql

  spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()

  #
  # Example input record:
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField

  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),     # "ArrDelay":5.0
    StructField("CRSArrTime", TimestampType(), True),    # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
    StructField("CRSDepTime", TimestampType(), True),    # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
    StructField("Carrier", StringType(), True),     # "Carrier":"WN"
    StructField("DayOfMonth", IntegerType(), True), # "DayOfMonth":31
    StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
    StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
    StructField("DepDelay", DoubleType(), True),     # "DepDelay":14.0
    StructField("Dest", StringType(), True),        # "Dest":"SAN"
    StructField("Distance", DoubleType(), True),     # "Distance":368.0
    StructField("FlightDate", DateType(), True),    # "FlightDate":"2015-12-30T16:00:00.000-08:00"
    StructField("FlightNum", StringType(), True),   # "FlightNum":"6109"
    StructField("Origin", StringType(), True),      # "Origin":"TUS"
  ])

  input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()

  #
  # Check for nulls in features before using Spark ML
  # (one count job per column; only columns with nulls are printed)
  #
  null_counts = [(column, features.where(features[column].isNull()).count()) for column in features.columns]
  cols_with_nulls = [pair for pair in null_counts if pair[1] > 0]
  print(cols_with_nulls)

  #
  # Add a Route variable ("<Origin>-<Dest>") to replace FlightNum
  #
  from pyspark.sql.functions import lit, concat
  features_with_route = features.withColumn(
    'Route',
    concat(
      features.Origin,
      lit('-'),
      features.Dest
    )
  )
  features_with_route.show(6)

  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay. Five split
  # points give four buckets (0-3): (-inf, -15), [-15, 0), [0, 30), [30, inf).
  #
  from pyspark.ml.feature import Bucketizer

  # Set up the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )

  # Save the bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

  # Apply the bucketizer
  ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

  #
  # Feature extraction tools from pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler

  # Turn each nominal field into a "<column>_index" column
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )

    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

    # Drop the original column
    ml_bucketized_features = ml_bucketized_features.drop(column)

    # Save the fitted indexer
    string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)

  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfMonth", "DayOfWeek",
    "DayOfYear"]
  index_columns = ["Carrier_index", "Origin_index",
                   "Dest_index", "Route_index"]
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)

  # Drop the index columns; they now live inside Features_vec
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)

  # Inspect the finalized features
  final_vectorized_features.show()

  # Instantiate and fit random forest classifier on all the data.
  # NOTE(review): maxBins presumably has to cover the cardinality of the
  # largest indexed categorical feature -- confirm 4657 against the data.
  from pyspark.ml.classification import RandomForestClassifier
  rfc = RandomForestClassifier(
    featuresCol="Features_vec",
    labelCol="ArrDelayBucket",
    predictionCol="Prediction",
    maxBins=4657,
    maxMemoryInMB=1024
  )
  model = rfc.fit(final_vectorized_features)

  # Save the new model over the old one
  model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  model.write().overwrite().save(model_output_path)

  # Score the model on the data it was trained on (there is no held-out
  # test set in this script, so the accuracy below is a training accuracy).
  predictions = model.transform(final_vectorized_features)

  from pyspark.ml.evaluation import MulticlassClassificationEvaluator
  evaluator = MulticlassClassificationEvaluator(
    predictionCol="Prediction",
    labelCol="ArrDelayBucket",
    metricName="accuracy"
  )
  accuracy = evaluator.evaluate(predictions)
  print("Accuracy = {}".format(accuracy))

  # Check the distribution of predictions
  predictions.groupBy("Prediction").count().show()

  # Check a sample: 0.1% without replacement, seed 18, ordered by departure
  predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
Ejemplo n.º 27
0
# Remove the unwanted columns and cache the result, which is reused by the
# split below.
df = df.drop(*drop_cols)
df.cache()

print("Creating Splits")
# Unseeded 70/30 train/test split.
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(selected_cols)))
print("Selected Features: {0}".format(selected_cols))

print("Building Pipeline")
# Hash every selected column into a 1024-dimensional "features" vector; all
# selected columns are declared categorical.
hasher = FeatureHasher(numFeatures=1024,
                       inputCols=selected_cols,
                       outputCol="features",
                       categoricalCols=selected_cols)
# Random forest predicting the "HasDetections" label from the hashed features.
forest = RandomForestClassifier(featuresCol="features",
                                labelCol="HasDetections",
                                predictionCol="prediction",
                                probabilityCol="probability")

pipeline = Pipeline(stages=[hasher, forest])
# Accuracy of "prediction" against "HasDetections".
evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections",
                                              predictionCol="prediction",
                                              metricName="accuracy")

print("Configuring Validation")
# Each grid axis holds a single value, so the grid contains exactly one
# parameter combination.
params = ParamGridBuilder() \
   .addGrid(hasher.numFeatures, [1024]) \
   .addGrid(forest.maxDepth, [30]) \
   .addGrid(forest.maxBins, [64]) \
   .addGrid(forest.numTrees, [100]) \
   .build()
Ejemplo n.º 28
0
                                          ["features", "label"])
        labelIndexer = StringIndexer(inputCol="label",
                                     outputCol="indexedLabel").fit(trainData)
        trainData = labelIndexer.transform(trainData)
        label = labelIndexer.labels
        labelDict = {}
        for i in range(len(label)):
            labelDict[label[i]] = i
        labelValIndex = list(labelDict.items())
        labelRdd = sc.parallelize(labelValIndex)
        labelDF = spark.createDataFrame(labelRdd, ['secID', 'index'])
        labelDF.write.save(
            'hdfs://master:9000//fcd/completeLabelIndexer/labelIndexer_{}'.
            format(index),
            format='parquet',
            mode='append')

        # df = spark.read.format('parquet').load('hdfs://master:9000//sparkExperiment/labelIndexer/labelIndexer_60438')

        rf = RandomForestClassifier(numTrees=3,
                                    maxDepth=2,
                                    labelCol='indexedLabel',
                                    featuresCol='features',
                                    seed=42)
        model1 = rf.fit(trainData)
        model1.save(
            'hdfs://master:9000//fcd/completeModel/model_{}'.format(index))
    end = time.time()
    print('训练花费时间: {}s'.format(end - start))
    sc.stop()
Ejemplo n.º 29
0
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3], 13795)

    trainingData.persist()
    testData.persist()
    print("Number of training set rows: %d" % trainingData.count())
    print("Number of test set rows: %d" % testData.count())

    # Train a RandomForest model.
    rf = RandomForestClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                numTrees=10)

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(
        stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
# Index the 'churned' target into 'label' and the 'intl_plan' column into
# 'intl_plan_indexed', then assemble the indexed plan column plus the reduced
# numeric columns into the 'features' vector.
label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label')
plan_indexer = StringIndexer(inputCol = 'intl_plan', outputCol = 'intl_plan_indexed')
input_cols=['intl_plan_indexed'] + reduced_numeric_cols
assembler = VectorAssembler(
    inputCols = input_cols,
    outputCol = 'features')

# Hyper-parameters come from the command line:
#   argv[1] = numTrees (int), argv[2] = maxDepth (int), argv[3] = impurity.
# Missing arguments raise IndexError; non-integer values raise ValueError.
param_numTrees=int(sys.argv[1])
param_maxDepth=int(sys.argv[2])
param_impurity=sys.argv[3]

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
classifier = RandomForestClassifier(labelCol = 'label', 
                                    featuresCol = 'features', 
                                    numTrees = param_numTrees, 
                                    maxDepth = param_maxDepth,  
                                    impurity = param_impurity)
# Indexers, assembler and classifier are fitted together on an unseeded
# 70/30 train split.
pipeline = Pipeline(stages=[plan_indexer, label_indexer, assembler, classifier])
(train, test) = churn_data.randomSplit([0.7, 0.3])
model = pipeline.fit(train)

# Record the hyper-parameters of this run with the cdsw tracker.
cdsw.track_metric("numTrees",param_numTrees)
cdsw.track_metric("maxDepth",param_maxDepth)
cdsw.track_metric("impurity",param_impurity)


from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import udf
# Predict on the held-out split; the evaluator is created with its default
# settings.
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator()