Example #1
def train_random_forest(df):
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=random.randint(0, 2**31 - 1))  # a usable random seed; int(random.random()) always yields 0
    return rf, rf.fit(td)
def build_randomForest(path):
    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1,2,3,5,6,8,10])\
                            .addGrid(rdf.numTrees,[1,5,10,30,50,100,200]).build()

    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)  # fit the CrossValidator rather than the bare classifier

    prediction = cvModel.transform(df)
    prediction.show()

    print("classification evaluation:", evaluator.evaluate(prediction))

    return cvModel,avg_age
def testClassification(data):
    # Train a RandomForest model.

    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel",seed=13)

    trainData,testData = td.randomSplit([0.8,0.2],13)

    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF\
        .select('label','indexLabel','prediction','rawPrediction','probability')
    for row in selected.collect():
        print(row)

    scoresAndLabels = predictionDF.rdd\
       .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
def testClassification(train, test):
    # Train a RandomForest model.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    # Note: Use larger numTrees in practice.

    rf = RandomForestClassifier(labelCol="indexedLabel", numTrees=3, maxDepth=4)

    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = MulticlassMetrics(predictionAndLabels)
    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
    print("precision %s" % metrics.precision())
    print("recall %s" % metrics.recall())
Example #5
mkdir(workdir + f'data/urf_{i}')

spark = init_spark()
neg_samples = get_negative_samples(spark).sample(1.0)
pos_samples = get_positive_samples(spark)

imbalance_ratio = (neg_samples.count() / pos_samples.count())

train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set, test_set = train_set.persist(), test_set.persist()

rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            cacheNodeIds=True,
                            maxDepth=17,
                            impurity='entropy',
                            featureSubsetStrategy='sqrt',
                            minInstancesPerNode=10,
                            numTrees=100,
                            subsamplingRate=1.0,
                            maxMemoryInMB=768)
ru = (
    RandomUnderSampler().setIndexCol('sample_id').setTargetImbalanceRatio(1.0))
pipeline = Pipeline().setStages([ru, rf])
model = pipeline.fit(train_set)


# Write model hyper-parameters
def write_params(model, path):
    with open(path, 'w') as file:
        for stage in model.stages:
            params = stage.extractParamMap()
            # Assumed completion of the truncated snippet: dump each stage's
            # parameter map as plain text.
            for param, value in params.items():
                file.write("{}: {}\n".format(param.name, value))
Example #6
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=100)
lrModel = lr.fit(train_ML)

# In[36]:

predictions = lrModel.transform(test_ML)

# - Random Forest

# In[37]:

from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol='label',
                            featuresCol='features',
                            numTrees=20,
                            maxDepth=20)
rfModel = rf.fit(train_ML)

# In[38]:

predictionsrf = rfModel.transform(test_ML)

# - Evaluation Metrics

# In[40]:

from pyspark.mllib.evaluation import MulticlassMetrics

results = predictions.select(['prediction', 'label'])
predictionAndLabels = results.rdd
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label",
                             outputCol="indexedLabel").fit(data)

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            numTrees=10)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)
 'imu_46',
 'imu_47',
 'imu_48',
 'imu_49',
 'imu_50'],outputCol='VectorFeatures')

#Random Forest classifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import time
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

classifier = RandomForestClassifier(labelCol='Activity_id',featuresCol='VectorFeatures',numTrees=200)
int_pipe = Pipeline(stages=[vec_assembler,classifier])
starttime = time.time()
model = int_pipe.fit(train)
prediction = model.transform(test)
t = time.time() - starttime
print("Time Taken = ", t)
acc_eval  = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='Activity_id')
a = acc_eval.evaluate(prediction)
print("Accuracy = ", a)
f = open("/N/u/risnaga/time.txt", 'a+')
f.write("Time Taken = " + str(t) + '\n')
f.close()
f = open("/N/u/risnaga/accu.txt", 'a+')
f.write("Accuracy = " + str(a) + '\n')
f.close()
pipeline = Pipeline(stages=stages_feat)
pipelineModel = pipeline.fit(df_yog)
df_yog = pipelineModel.transform(df_yog)
selected_Cols = ['Pred_Label', 'features_all'] + cloum_set
df_yog = df_yog.select(selected_Cols)
# df_yog.printSchema()

# splits = df_yog.randomSplit([0.6,0.4], 1234)
training_data, testing_data = df_yog.randomSplit([0.6805, 0.3195],
                                                 seed=99999999)
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(testing_data.count()))

#Model
RandomForest = RandomForestClassifier(labelCol="Pred_Label",
                                      featuresCol="features_all",
                                      numTrees=10)

start = time.time()
RandomForestModel = RandomForest.fit(training_data)
end = time.time()
start1 = time.time()
f_predictions = RandomForestModel.transform(testing_data)
end1 = time.time()

# #PRINT CONFUSION MATRIX
# Cm=f_predictions.select("PoutLabel","label").distinct().toPandas()
# f_predictions.groupBy("PoutLabel","prediction").count().show()

print("Time to train:")
print(end - start)
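
# A minimal confusion-matrix sketch for the commented-out block above; it
# assumes the true labels live in "Pred_Label" (this classifier's labelCol),
# not in a "PoutLabel" column.
f_predictions.groupBy("Pred_Label", "prediction").count()\
    .orderBy("Pred_Label", "prediction").show()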
Example #10
# In[53]:

data.show()
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# In[54]:

from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier,
                                       DecisionTreeClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

(trainingData, testData) = data.randomSplit([0.7, 0.3])
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=20)
model_rf = rf.fit(trainingData)

# In[55]:

prediction_rf = model_rf.transform(testData)

# In[56]:

prediction_rf.show()
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(prediction_rf)
accuracy
Example #11
df.describe().show()



from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['A','B','C','D'], outputCol='features')
output = assembler.transform(df)



output.printSchema()



from pyspark.ml.classification import RandomForestClassifier,GBTClassifier, DecisionTreeClassifier
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')



final_data = output.select('features', 'Spoiled')



final_data.show()



rfcModel = rfc.fit(final_data)


def main(base_path):

    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Setup the Departure Bucketizer for other examples
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    departure_bucketizer = Bucketizer(splits=splits,
                                      inputCol="DepDelay",
                                      outputCol="DepDelayBucket")

    # Save the departure bucketizer
    departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(
        base_path)
    departure_bucketizer.write().overwrite().save(departure_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate model using test data
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Import feature extraction tools from pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop split_count times over 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the score to a score log that exists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute the existing score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append the existing average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append the existing average deltas to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
Example #14
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != title)\
    .map(lambda x: x.split(","))  # 'title' is assumed to hold the CSV header row
D = 2 ** 24 

def helper1(r):
    features=[]
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_"+'{0:04}'.format(i)+fe[i])))%D)
        target = float(r[-1])
        ID=float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0, Vectors.dense([0.0] * 1932))  # keep the same schema as the success path
new_rdd = rdd.filter(lambda i : len(i)==1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans,["label", "features"])
(trainingData, testData) = df.randomSplit([0.7, 0.3])

stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(trainingData)
td = si_model.transform(trainingData)

rf = RandomForestClassifier(numTrees=50, maxDepth=25, labelCol="indexed", seed=42)
model = rf.fit(td)
result = model.transform(testData).rdd.map(lambda r: str(r.label)+','+str(r.probability[0]))
result.saveAsTextFile("/user/demo/rf_50_25")

def main(args):
    textFiles = sc.wholeTextFiles(maindir + '4').map(readContents)
    #print "READ second {} check ".format(textFiles.take(10))
    '''
        Keep only the rows whose ids appear in the
        training file; drop the rest.
        http://stackoverflow.com/questions/24718697/pyspark-drop-rows
    '''

    htmldf = sqlContext.createDataFrame(textFiles)
    htmldf.cache()


    traindf = getCleanedRDD(maindir + 'train_v2.csv', ["id", "images", "links", "text", "label"], htmldf)
    traindf.write.save(maindir+"output/train_4.parquet", format="parquet")



    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.01)
    rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="label")
    rf = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)  # overrides the GBTClassifier above
    #https://databricks.com/blog/2015/07/29/new-features-in-machine-learning-pipelines-in-spark-1-4.html
    #http://spark.apache.org/docs/latest/api/python/pyspark.ml.html

    #w2v = Word2Vec(inputCol="text", outputCol="w2v")

    rfc = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])



    # Fit the pipeline to training documents.
    model = pipeline.fit(traindf)

    print('-----------------------------------------------------------------------------')
    testdf = getCleanedRDD(maindir + 'test.csv', ["id", "images", "links", "text", "label"], htmldf)
    #print testdf.count()



    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(testdf)
    #print('prediction', prediction)

    '''
    pand = prediction.toPandas()
    pand.to_csv('testpanda.csv', sep='\t', encoding='utf-8')	
    print "Done!!! CSV"

    '''
    #prediction.select('id','probability','prediction').write.format('com.databricks.spark.csv').option("header", "true").save(maindir + 'output/result_lr0.csv')
    # ('prediction', DataFrame[id: string, images: bigint, links: bigint, text: string, label: double,
    # words: array<string>, features: vector, rawPrediction: vector, probability: vector, prediction: double])

    '''
    #write in scala
    selected = prediction.select("id", "probability", "prediction")
    for row in selected.collect():
        print row
    '''
    sc.stop()
                            outputCol='features')

# Consolidate predictor columns
flites = assembler.transform(flites)

print("Sample model input")
print(flites.toPandas().sample(12))

# Split the data into training and testing sets
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Create model objects and train on training data
#tree = DecisionTreeClassifier().fit(flights_train)
#gbt = GBTClassifier().fit(flights_train)

forest = RandomForestClassifier()

# Create parameter grid
params = ParamGridBuilder()
# Add grids for two parameters
params = params.addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
               .addGrid(forest.maxDepth, [2, 5, 10])

# Build the parameter grid
params = params.build()

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()

# create cross-validation object
cv = CrossValidator(estimator=forest,
# 
# Feature selection is not really supported in MLlib yet, so we just applied dimensionality reduction using PCA

# In[509]:

pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)

train_df = pca.transform(train_df)
test_df = pca.transform(test_df)


# ## Classification algorithms

# In[ ]:

rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="pca", numTrees=5000)
#rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="features", numTrees=5000)
model = rf.fit(train_df)


# ## Evaluation & results

# In[ ]:

label_to_str_map = {'2': 'HOME', '1': 'DRAW', '0': 'AWAY'}
str_to_labelmap = {'HOME': '2', 'DRAW': '1', 'AWAY': '0'}
predictions = model.transform(test_df).select("home_name", "away_name", "B365A", "B365D", "B365H", "probability", 
                                              "indexedResult")

length = test_df.count()
correct = 0
#transforming the words to vectors using the trained model
transformDF = wvModel.transform(reviewDF)
#segregating the labels and features
selectData = transformDF.select("label","features","id")
#Creating RDD of LabeledPoints
lpSelectData = selectData.rdd.map(lambda x: (x.id, LabeledPoint(x.label, x.features)))
#Instantiating string indexer for random forest
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
#fitting the data in stringindexer
si_model = stringIndexer.fit(selectData)
#transforming the data
transformData = si_model.transform(selectData)
#Spliting the data for training and test
(trainingData, testData) = transformData.randomSplit([0.6, 0.4])
#instantiating Random forest model
randomForest = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
#training the model
randomForestModel = randomForest.fit(trainingData)
#trsnforming test data
result = randomForestModel.transform(testData)
#calculating the accuracy and printing it.
accuracy = result.filter(result.label == result.prediction).count() / float(testData.count())
print("Accuracy = " + str(accuracy))







 'global_subjectivity',
 'global_sentiment_polarity',
 'title_subjectivity',
 'title_sentiment_polarity',
 'abs_title_subjectivity',
 'abs_title_sentiment_polarity'],outputCol='features' )
new_data = assembler.transform(data)


final_data = new_data.select('features','shares')
from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result','features')
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250,labelCol='result',featuresCol='features')
train_data,test_data = finalData.randomSplit([0.7,0.3])
rfc_model = rfc.fit(train_data)
result = rfc_model.transform(test_data);
from pyspark.ml.evaluation import BinaryClassificationEvaluator
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))
test_data.head(1)


# import os, sys
# import pandas
# import plotly.plotly as py
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import cufflinks as cf
# import plotly.graph_objs as go
# COMMAND ----------

# MAGIC %md
# MAGIC ####Random Forest
# MAGIC 
# MAGIC Random Forests use an ensemble of trees to improve model accuracy.
# MAGIC 
# MAGIC You can read more about Random Forest from the programming guide [here](http://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests).

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

# Create an initial RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Train model with Training Data
rfModel = rf.fit(trainingData)

# COMMAND ----------

# Make predictions on test data using the Transformer.transform() method.
predictions = rfModel.transform(testData)

# COMMAND ----------

predictions.printSchema()

# COMMAND ----------
Example #21
def main(base_path):
  
  # Default to "."
  try: base_path
  except NameError: base_path = "."
  if not base_path:
    base_path = "."
  
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except (NameError, UnboundLocalError) as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),     # "ArrDelay":5.0
    StructField("CRSArrTime", TimestampType(), True),    # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
    StructField("CRSDepTime", TimestampType(), True),    # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
    StructField("Carrier", StringType(), True),     # "Carrier":"WN"
    StructField("DayOfMonth", IntegerType(), True), # "DayOfMonth":31
    StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
    StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
    StructField("DepDelay", DoubleType(), True),     # "DepDelay":14.0
    StructField("Dest", StringType(), True),        # "Dest":"SAN"
    StructField("Distance", DoubleType(), True),     # "Distance":368.0
    StructField("FlightDate", DateType(), True),    # "FlightDate":"2015-12-30T16:00:00.000-08:00"
    StructField("FlightNum", StringType(), True),   # "FlightNum":"6109"
    StructField("Origin", StringType(), True),      # "Origin":"TUS"
  ])
  
  input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features.where(features[column].isNull()).count()) for column in features.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print(list(cols_with_nulls))
  
  #
  # Add a Route variable to replace FlightNum
  #
  from pyspark.sql.functions import lit, concat
  features_with_route = features.withColumn(
    'Route',
    concat(
      features.Origin,
      lit('-'),
      features.Dest
    )
  )
  features_with_route.show(6)
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Import feature extraction tools from pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Drop the original column
    ml_bucketized_features = ml_bucketized_features.drop(column)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfMonth", "DayOfWeek",
    "DayOfYear"]
  index_columns = ["Carrier_index", "Origin_index",
                   "Dest_index", "Route_index"]
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  # Instantiate and fit random forest classifier on all the data
  from pyspark.ml.classification import RandomForestClassifier
  rfc = RandomForestClassifier(
    featuresCol="Features_vec",
    labelCol="ArrDelayBucket",
    predictionCol="Prediction",
    maxBins=4657,
    maxMemoryInMB=1024
  )
  model = rfc.fit(final_vectorized_features)
  
  # Save the new model over the old one
  model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  model.write().overwrite().save(model_output_path)
  
  # Evaluate model using test data
  predictions = model.transform(final_vectorized_features)
  
  from pyspark.ml.evaluation import MulticlassClassificationEvaluator
  evaluator = MulticlassClassificationEvaluator(
    predictionCol="Prediction",
    labelCol="ArrDelayBucket",
    metricName="accuracy"
  )
  accuracy = evaluator.evaluate(predictions)
  print("Accuracy = {}".format(accuracy))
  
  # Check the distribution of predictions
  predictions.groupBy("Prediction").count().show()
  
  # Check a sample
  predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
Example #22
def build_model(df_ml):
    '''
    Function builds a classification model based on the user features
    
    INPUT:
        df_ml 
        
    OUTPUT:
        model - final trained model
    '''

    # split into train, test and validation sets (70% - 15% - 15%)
    df_ml = df_ml.withColumnRenamed("churn", "label")

    train, test_valid = df_ml.randomSplit([0.7, 0.3], seed=2048)
    test, validation = test_valid.randomSplit([0.5, 0.5], seed=2048)

    # index and encode categorical features gender, level and state

    stringIndexerGender = StringIndexer(inputCol="gender",
                                        outputCol="genderIndex",
                                        handleInvalid='skip')
    stringIndexerLevel = StringIndexer(inputCol="last_level",
                                       outputCol="levelIndex",
                                       handleInvalid='skip')
    stringIndexerState = StringIndexer(inputCol="last_state",
                                       outputCol="stateIndex",
                                       handleInvalid='skip')

    encoder = OneHotEncoderEstimator(
        inputCols=["genderIndex", "levelIndex", "stateIndex"],
        outputCols=["genderVec", "levelVec", "stateVec"],
        handleInvalid='keep')

    # create vector for features
    features = [
        'genderVec', 'levelVec', 'stateVec', 'days_active', 'avg_songs',
        'avg_events', 'thumbs_up', 'thumbs_down', 'addfriend'
    ]
    assembler = VectorAssembler(inputCols=features, outputCol="rawFeatures")

    # normalize features
    normalizer = Normalizer(inputCol="rawFeatures",
                            outputCol="features",
                            p=1.0)

    # initialize random forest classifier with tuned hyperparameters
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="features",
                                numTrees=120,
                                impurity='gini',
                                maxDepth=5,
                                featureSubsetStrategy='sqrt')

    # assemble pipeline
    pipeline = Pipeline(stages=[
        stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder,
        assembler, normalizer, rf
    ])

    # fit model
    model = pipeline.fit(train)

    # predict churn
    pred_train = model.transform(train)
    pred_test = model.transform(test)
    pred_valid = model.transform(validation)

    # evaluate results
    predictionAndLabels = pred_train.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # print F1-score
    print("Train F1: %s" % metrics.fMeasure())

    predictionAndLabels = pred_test.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # F1 score
    print("Test F1: %s" % metrics.fMeasure())

    predictionAndLabels = pred_valid.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # F1 score
    print("Validation F1: %s" % metrics.fMeasure())

    return model
# COMMAND ----------

#DEFINING THE MODELS AND PARAMETERS

#NOTE: Because Databricks was slow to run and the cluster detaches after 2 hours, we keep only these combinations of hyperparameters here. The models were run with more combinations of hyperparameters in the attached Jupyter Notebook.

#NAIVE BAYES MODEL
nb = NaiveBayes()
nbParams = ParamGridBuilder().addGrid(nb.smoothing, [0.01,1]).build()

#LOGISTIC REGRESSION MODEL
lr = LogisticRegression()
lrParams = ParamGridBuilder().addGrid(lr.maxIter, [10, 150]).build()

#RANDOM FOREST MODEL
rfc = RandomForestClassifier()
rfParams = ParamGridBuilder().addGrid(rfc.numTrees, [150, 300]).build()

#DECISION TREE
dt = DecisionTreeClassifier()
dtParams = ParamGridBuilder().addGrid(dt.maxDepth, [4, 10]).build()

#GRADIENT BOOSTING MODEL
gb = GBTClassifier()
gbParams = ParamGridBuilder().addGrid(gb.maxDepth,[2,4]).build()



### Hyperparameters used for the analysis (ran in Jupyter) ###
# #Gradient Boosting
# gb = GradientBoostingClassifier()
Example #24
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")

pipeline = Pipeline(stages=indexers + [label_stringIdx, encoder, assembler])
encoded_df = pipeline.fit(df).transform(df)

selectedCols = ['label', 'features'] + cols
dataset = encoded_df.select(selectedCols)

# Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# Fit model and train
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=10)
rf2 = RandomForestClassifier(labelCol="label",
                             featuresCol="features",
                             numTrees=100)

model = rf.fit(trainingData)
model2 = rf2.fit(trainingData)
predictions = model.transform(testData)
predictions2 = model2.transform(testData)

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy2 = evaluator.evaluate(predictions2)
data_df = event_df.select(*exprs)

# In[ ]:

(train_df, test_df) = data_df.randomSplit([0.9, 0.1])

# In[ ]:

labelIndexer = StringIndexer(inputCol="target",
                             outputCol="label").fit(train_df)

featureAssembler = VectorAssembler(
    inputCols=[x for x in field_names if x.startswith('attr')],
    outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=10)
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)
pipeline = Pipeline(
    stages=[featureAssembler, labelIndexer, rf, labelConverter])

# In[ ]:

model = pipeline.fit(train_df)

# In[ ]:

predict_df = model.transform(test_df)
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="lowhigh",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy= %g" % (accuracy))
print("Test Error = %g" % (1.0 - accuracy))

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = RandomForestClassifier(labelCol="lowhigh",
                            featuresCol="features",
                            numTrees=10,
                            maxDepth=3)
model = rf.fit(train_df)

predictions = model.transform(test_df)
predictions.select("prediction", "lowhigh", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="lowhigh",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy= %g" % (accuracy))
print("Test Error = %g" % (1.0 - accuracy))

# COMMAND ----------
# Compute TF-IDF
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))
rfClassifier = RandomForestClassifier(numTrees=10,
                                      maxDepth=10,
                                      seed=0,
                                      labelCol="indexed")

start_time = time.time()
modelClassifier = rfClassifier.fit(trainingData)
end_time = time.time()

cost_time = end_time - start_time
print("spark rf time  :", cost_time)

predictionsClassifier = modelClassifier.transform(testData)

evaluator = MulticlassClassificationEvaluator().setLabelCol(
    "indexed").setPredictionCol("prediction")
print(
    "accuracy = ",
Example #28
# data = data.select("*", F.when(data.X == ' <=50K', 1).when(data.X == ' >50K', 2).otherwise(0).alias('label'))
data = data.withColumnRenamed("age", "label").select("label", "education-num",
                                                     "hours-per-week")
data = data.select(data.label.cast("double"), "education-num",
                   "hours-per-week")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
data.show()
# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.6, 0.4])

# Create Random Forest model and fit the model with training dataset
rf = RandomForestClassifier()
model = rf.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evuluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
print("Accuracy:", accuracy)

# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
metrics = MulticlassMetrics(predictionAndLabels)
Example #29
    # Evaluate model using the AUC metric
    auc_dt_default_dev = evaluator.evaluate(dt_predictions_default_dev, {evaluator.metricName: 'areaUnderROC'})

    # Print result to standard output
    print('Decision Tree, Default Parameters, Development Set, AUC: ' + str(auc_dt_default_dev))

    # TODO: Check for signs of overfitting (by evaluating the model on the training set)
    # [FIX ME!] Write code below

    # TODO: Tune the decision tree model by changing one of its hyperparameters
    # Build and evaluate decision trees with the following maxDepth values: 3 and 4.
    # [FIX ME!] Write code below
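    # A hedged sketch for the tuning TODO above: it reuses label_indexer,
    # train_tfidf, dev_tfidf and evaluator from this script, and assumes
    # DecisionTreeClassifier is imported alongside the other classifiers.
    for max_depth in [3, 4]:
        dt_classifier_tuned = DecisionTreeClassifier(labelCol='label', featuresCol='TFIDF', maxDepth=max_depth)
        dt_pipeline_tuned = Pipeline(stages=[label_indexer, dt_classifier_tuned])
        dt_predictions_tuned_dev = dt_pipeline_tuned.fit(train_tfidf).transform(dev_tfidf)
        auc_dt_tuned_dev = evaluator.evaluate(dt_predictions_tuned_dev, {evaluator.metricName: 'areaUnderROC'})
        print('Decision Tree, maxDepth=' + str(max_depth) + ', Development Set, AUC: ' + str(auc_dt_tuned_dev))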

    # Train a random forest with default parameters (including numTrees=20)
    rf_classifier_default = RandomForestClassifier(labelCol = 'label', featuresCol = 'TFIDF', numTrees=20)

    # Create an ML pipeline for the random forest model
    rf_pipeline_default = Pipeline(stages=[label_indexer, rf_classifier_default])

    # Apply pipeline and train model
    rf_model_default = rf_pipeline_default.fit(train_tfidf)

    # Apply model on development data
    rf_predictions_default_dev = rf_model_default.transform(dev_tfidf)

    # Evaluate model using the AUC metric
    auc_rf_default_dev = evaluator.evaluate(rf_predictions_default_dev, {evaluator.metricName: 'areaUnderROC'})

    # Print result to standard output
    print('Random Forest, Default Parameters, Development Set, AUC: ' + str(auc_rf_default_dev))
Example #30
dt_train.show(5)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
#assembler = VectorAssembler(inputCols =["Day","Temp","Lat","Long","Admin_index","Province_index"],outputCol="normfeatures")
assembler = VectorAssembler(inputCols=["Date", "Day", "Temp"],
                            outputCol="normfeatures")
#assembler = VectorAssembler(inputCols =["Date","Year","Day","Temp"],outputCol="features")
minMax = MinMaxScaler(inputCol=assembler.getOutputCol(), outputCol="nfeatures")
featVect = VectorAssembler(inputCols=["nfeatures"], outputCol="features")
dt = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            impurity="gini",
                            featureSubsetStrategy="auto",
                            numTrees=10,
                            maxDepth=30,
                            maxBins=128,
                            seed=1234)
pipeline = Pipeline(stages=[assembler, minMax, featVect, dt])

piplineModel = pipeline.fit(dt_train)
print("Pipeline complete!")
prediction = piplineModel.transform(dt_test)
predicted = prediction.select("features", "prediction", "trueLabel")
predicted.show(100, truncate=False)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluation = MulticlassClassificationEvaluator(labelCol="trueLabel",
                                               predictionCol="prediction",
                                               metricName="accuracy")
Example #31
scaler = MinMaxScaler(inputCol="atributos",
                      outputCol="scaledFeatures",
                      min=0.0,
                      max=1.0)
scalerModel = scaler.fit(fluxoDF)
scaledData = scalerModel.transform(fluxoDF)

# Indexing is a prerequisite for Decision Trees
stringIndexer = StringIndexer(inputCol="rotulo", outputCol="indexed")
si_model = stringIndexer.fit(scaledData)
obj_final = si_model.transform(scaledData)

# Creating the model
rfClassifer = RandomForestClassifier(labelCol="indexed",
                                     featuresCol="scaledFeatures",
                                     probabilityCol="probability",
                                     numTrees=20)
gbtClassifer = GBTClassifier(labelCol="rotulo", featuresCol="scaledFeatures")
(dados_treino, dados_teste) = obj_final.randomSplit([0.7, 0.3])

modelorf = rfClassifer.fit(dados_treino)
modelogbt = gbtClassifer.fit(dados_treino)

pred_rf = modelorf.transform(dados_teste)
pred_gbt = modelogbt.transform(dados_teste)


def mont_feat(pred1, pred2):
    predict = [
        pred1['probability'][0], pred1['probability'][1],
        pred2['probability'][0], pred2['probability'][1]
Example #32
0
def create_pipeline(columns):
    assembler = VectorAssembler(inputCols=columns, outputCol="features")
    labelIndexer = StringIndexer(inputCol="stars", outputCol="label")
    rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=30, maxDepth=10)
    return Pipeline(stages=[assembler, labelIndexer, rf])
Example #33
0
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],
                            outputCol='features')

# COMMAND ----------

fit_data = assembler.transform(data)

# COMMAND ----------

fit_data.show(5)

# COMMAND ----------

rfc = RandomForestClassifier(featuresCol='features',
                             labelCol='Spoiled',
                             numTrees=100)

# COMMAND ----------

final_data = fit_data.select('features', 'Spoiled')

# COMMAND ----------

final_data.show(5)

# COMMAND ----------

rfc = RandomForestClassifier(labelCol='Spoiled')

# COMMAND ----------
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print "DecisionTree Test set accuracy =  " + str(accuracy) + "\n"

#treeModel = model.stages[2]
# summary only
#print(treeModel)

"""
Random Forest
"""
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

# Train model.  This also runs the indexers.
model = rf.fit(train)

# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
#predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
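# The original stops without reporting the result; an assumed completion, mirroring
# the decision-tree snippet above:
print("RandomForest Test set accuracy = " + str(accuracy))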
#rfModel = model.stages[2]
nb = NaiveBayes(labelCol="label", featuresCol="features")
nbStages = dataPrepStages + [nb]

nbPipeline = Pipeline(stages=nbStages)

# COMMAND ----------

# MAGIC %md
# MAGIC We do the same for a Random Forest model.

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features")
rfStages = dataPrepStages + [rf]

rfPipeline = Pipeline(stages=rfStages)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Define Experiment Parameters

# COMMAND ----------

# MAGIC %md
# MAGIC Some parameters are going to be used across all algorithms.
# MAGIC We set the evaluator (default for `BinaryClassificationEvaluator` is AUC) and the number of folds for our cross-validation (k=10).
# MAGIC Again, these parameters are going to be applied to all the algorithms we train.
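
# COMMAND ----------

# The notebook excerpt ends before the shared parameters are shown; a minimal sketch of
# what the cell described above might contain (the grid values and variable names here
# are illustrative assumptions, not from the original notebook):
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

evaluator = BinaryClassificationEvaluator()  # default metric: areaUnderROC
numFolds = 10

# Example: wrap the Random Forest pipeline defined above in 10-fold cross-validation
rfGrid = ParamGridBuilder().addGrid(rf.numTrees, [20, 50, 100]).build()
rfCrossValidator = CrossValidator(estimator=rfPipeline,
                                  estimatorParamMaps=rfGrid,
                                  evaluator=evaluator,
                                  numFolds=numFolds)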
Example #36
0
assembler = VectorAssembler(inputCols=namesH, outputCol="features")

higgs = assembler.transform(higgsRaw.dropna())
higgsData = higgs.select('features', 'label')

print("vectorised:  ")

higgsData.show(10)
higgsData.printSchema()

(trainingData, testData) = higgsData.randomSplit([0.7, 0.3], 42)

rfc = RandomForestClassifier(labelCol="label",
                             featuresCol="features",
                             numTrees=3,
                             maxBins=20,
                             maxDepth=7,
                             featureSubsetStrategy="all")

clasEv = BinaryClassificationEvaluator()

mClasEv = MulticlassClassificationEvaluator(metricName="accuracy")

rfcModel = rfc.fit(trainingData)

pred = rfcModel.transform(testData)

AUC = clasEv.evaluate(pred)
accuracy = mClasEv.evaluate(pred)

print("AUC: ", AUC)
    df_proper.printSchema()

    labelIndexer = StringIndexer(inputCol='Churn', outputCol='label')

    assembler = VectorAssembler(inputCols=[
        "SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"
    ],
                                outputCol="features")

    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4)

    (train, test) = df_proper.randomSplit([0.7, 0.3])

    classifier = RandomForestClassifier(labelCol='label',
                                        featuresCol='features')

    pipeline = Pipeline(stages=[labelIndexer, assembler, classifier])

    model = pipeline.fit(train)

    predictions = model.transform(test)

    evaluator = BinaryClassificationEvaluator()

    auroc = evaluator.evaluate(predictions,
                               {evaluator.metricName: "areaUnderROC"})
    test = int(auroc)
    print(auroc)

    f = open(sys.argv[2], 'w')
# Check out the features
final_vectorized_features.show()

#
# Cross validate, train and evaluate classifier
#

# Test/train split
training_data, test_data = final_vectorized_features.randomSplit([0.7, 0.3])

# Instantiate and fit random forest classifier
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(
  featuresCol="Features_vec",
  labelCol="ArrDelayBucket",
  maxBins=4657,
  maxMemoryInMB=1024
)
model = rfc.fit(training_data)

# Evaluate model using test data
predictions = model.transform(test_data)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="ArrDelayBucket", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = {}".format(accuracy))

# Check a sample
predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
Example #39
0
exploder = TileExploder()
exploded_tiles = exploder.transform(df_labeled)

exploded_train, exploded_test = exploded_tiles.randomSplit([0.8, 0.2], seed=42)
noDataFilter = NoDataFilter().setInputCols(['label', 'blue', 'green', 'red', 'NIR', 'SWIR1', 'SWIR2', 'brightness'])

exploded_tiles_filtered_train = noDataFilter.transform(exploded_train)
exploded_tiles_filtered_test = noDataFilter.transform(exploded_test)

assembler = VectorAssembler().setInputCols(bands) \
                                 .setOutputCol("features")

assembled_df_train = assembler.transform(exploded_tiles_filtered_train)
assembled_df_test = assembler.transform(exploded_tiles_filtered_test)

classifier = RandomForestClassifier().setLabelCol('label') \
                        .setFeaturesCol(assembler.getOutputCol())

model = classifier.fit(assembled_df_train.cache())

prediction_df = model.transform(assembled_df_test).drop(assembler.getOutputCol()).cache()

evaluator = MulticlassClassificationEvaluator(
                predictionCol=classifier.getPredictionCol(),
                labelCol=classifier.getLabelCol(),
                metricName='accuracy'
)

accuracy = evaluator.evaluate(prediction_df)
print("\nAccuracy:", accuracy)

cnf_mtrx = prediction_df.groupBy(classifier.getPredictionCol()) \
    def _train_model_spark(self, data):
        df = self._prepare_data_spark(data)
        input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                                self.TODAY_PRICE}))

        if self.ann_hidden_nodes_num is None:
            self.ann_hidden_nodes_num = input_num // 2 + 1  # integer division so the layer size is an int
        ann_layers = [input_num,
                      # input_num / 3 * 2,
                      # input_num / 3,
                      self.ann_hidden_nodes_num,
                      2]

        self.logger.info('layer settings are {}'.format(ann_layers))
        self.logger.info('training method is {}'.format(self._train_method))
        self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
        if isinstance(self._train_method, dict):
            if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                self._model[self.CHANGE_AMOUNT].stop_server()
            self._model = {self.CHANGE_AMOUNT: None,
                           self.CHANGE_DIRECTION: None}

            if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                      maxIter=self.linear_regression_training_times,
                                      regParam=self.linear_regression_regularization_parameter,
                                      predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = lr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth,
                                            predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                          num_workers=self.spark_worker_numbers,
                                                                          epoch=self.ann_epoch_number,
                                                                          featuresCol="features",
                                                                          labelCol=self.CHANGE_AMOUNT,
                                                                          predictionCol='AmountPrediction'
                                                                          )
                self._model[self.CHANGE_AMOUNT].fit(df)
            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

            if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
                lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                        maxIter=self.logistic_regression_training_times,
                                        regParam=self.linear_regression_regularization_parameter,
                                        predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = lr.fit(df)
            elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
                rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                             numTrees=self.random_forest_tree_number,
                                             maxDepth=self.random_forest_tree_max_depth,
                                             predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = rfc.fit(df)

            elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 2
                mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                      labelCol=self.CHANGE_DIRECTION,
                                                      layers=ann_layers,
                                                      predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        else:
            if self._train_method == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                      regParam=self.linear_regression_regularization_parameter,
                                      maxIter=self.linear_regression_training_times)
                self._model = lr.fit(df)
            elif self._train_method == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                            predictionCol='prediction',
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth)
                self._model = rfr.fit(df)

            elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                if self._model is not None:
                    self._model.stop_server()
                self.logger.warn('layers are {}'.format(ann_layers))
                self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                      num_workers=self.spark_worker_numbers, epoch=100,
                                                      featuresCol="features", labelCol=self.TARGET_PRICE,
                                                      predictionCol='prediction'
                                                      )
                self._model.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        return self._model
    inputcols = ['Source','Side','Wind_Direction','Weather_Condition','Sunrise_Sunset','State','Timezone']
    indexers = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in inputcols]
    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)
    df = df.drop(*inputcols)
    return df

def transform(df):    
    cols = df.columns
    cols.remove('Severity')
    vecAssembler = VectorAssembler(inputCols=cols, outputCol="features")
    df_transformed = vecAssembler.transform(df)
    return df_transformed

def evaluate_model(df):
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='Severity', metricName='accuracy')
    accuracy_rf = evaluator.evaluate(df)
    return accuracy_rf

preprocessed_df = preprocessing(accident_df)
indexed_df = indexing(preprocessed_df)
transformed_df = transform(indexed_df)
#Split data into Training and Testing
train, test = transformed_df.randomSplit([0.7, 0.3], seed = 2000)
#Using Random Forest Algorithm
rf = RF(featuresCol='features',numTrees=12, maxDepth=16, labelCol="Severity",maxBins=150)
model_rf = rf.fit(train)
#Predicting on test data
prediction_rf = model_rf.transform(test)
accuracy = evaluate_model(prediction_rf)
print("Accuracy is ",accuracy)
Example #42
0
def test_pyspark_classifier_decision_tree():
    try:
        import pyspark
        import sklearn.datasets
        from pyspark.sql import SparkSession
        from pyspark import SparkContext, SparkConf
        from pyspark.ml.feature import VectorAssembler, StringIndexer
        from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
        import pandas as pd
        import numpy as np
    except:
        print("Skipping test_pyspark_classifier_decision_tree!")
        return
    import shap

    iris_sk = sklearn.datasets.load_iris()
    iris = pd.DataFrame(data=np.c_[iris_sk['data'], iris_sk['target']],
                        columns=iris_sk['feature_names'] + ['target'])[:100]
    spark = SparkSession.builder.config(
        conf=SparkConf().set("spark.master", "local[*]")).getOrCreate()

    col = [
        "sepal_length", "sepal_width", "petal_length", "petal_width", "type"
    ]
    iris = spark.createDataFrame(iris, col)
    iris = VectorAssembler(inputCols=col[:-1],
                           outputCol="features").transform(iris)
    iris = StringIndexer(inputCol="type",
                         outputCol="label").fit(iris).transform(iris)

    classifiers = [
        GBTClassifier(labelCol="label", featuresCol="features"),
        RandomForestClassifier(labelCol="label", featuresCol="features"),
        DecisionTreeClassifier(labelCol="label", featuresCol="features")
    ]
    for classifier in classifiers:
        model = classifier.fit(iris)
        explainer = shap.TreeExplainer(model)
        X = pd.DataFrame(data=iris_sk.data,
                         columns=iris_sk.feature_names)[:100]  # pylint: disable=E1101

        shap_values = explainer.shap_values(X)
        expected_values = explainer.expected_value

        predictions = model.transform(iris).select("rawPrediction")\
            .rdd.map(lambda x:[float(y) for y in x['rawPrediction']]).toDF(['class0','class1']).toPandas()

        if str(type(model)).endswith("GBTClassificationModel'>"):
            diffs = expected_values + shap_values.sum(1) - predictions.class1
            assert np.max(
                np.abs(diffs)
            ) < 1e-4, "SHAP values don't sum to model output for class0!"
        else:
            normalizedPredictions = (predictions.T / predictions.sum(1)).T
            diffs = expected_values[0] + shap_values[0].sum(
                1) - normalizedPredictions.class0
            assert np.max(
                np.abs(diffs)
            ) < 1e-4, "SHAP values don't sum to model output for class0!" + model
            diffs = expected_values[1] + shap_values[1].sum(
                1) - normalizedPredictions.class1
            assert np.max(
                np.abs(diffs)
            ) < 1e-4, "SHAP values don't sum to model output for class1!" + model
            assert (np.abs(expected_values - normalizedPredictions.mean()) <
                    1e-1).all(), "Bad expected_value!" + model
    spark.stop()
Example #43
0
#             [   2.,    2.,    1.,    8.,  197.,    0.,    0.,    2.,    3.,
#                 1.],
#             [   1.,    0.,    1.,    0.,    2.,  183.,    0.,    1.,    0.,
#                 1.],
#             [   1.,    0.,    0.,    0.,    0.,    0.,  192.,    1.,    1.,
#                 0.],
#             [   0.,    0.,    0.,    0.,    0.,    0.,    1.,  187.,    5.,
#                 0.],
#             [   0.,    1.,    2.,    0.,    0.,    0.,    1.,    5.,  172.,
#                 4.],
#             [   0.,    0.,    0.,    0.,    3.,    0.,    0.,    2.,    2.,
#               176.]])

#section 8.3.2
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(maxDepth=20)
rfmodel = rf.fit(pendttrain)
# RandomForestModel doesn't expose trees field in Python
rfpredicts = rfmodel.transform(pendtvalid)
rfresrdd = rfpredicts.select("prediction", "label").rdd.map(lambda row: (row.prediction, row.label))
rfmm = MulticlassMetrics(rfresrdd)
rfmm.precision()
#0.9894640403114979
print(rfmm.confusionMatrix())
#DenseMatrix([[ 211.,    0.,    1.,    0.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#             [   0.,  220.,    0.,    1.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#             [   0.,    1.,  211.,    0.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#             [   0.,    0.,    0.,  175.,    1.,    0.,    0.,    0.,    0.,