Example #1
def test_save_and_load_on_nested_list_params(self):
    temp_path = tempfile.mkdtemp()
    splitsArray = [
        [-float("inf"), 0.5, 1.4, float("inf")],
        [-float("inf"), 0.1, 1.2, float("inf")],
    ]
    bucketizer = Bucketizer(splitsArray=splitsArray,
                            inputCols=["values", "values"],
                            outputCols=["b1", "b2"])
    savePath = temp_path + "/bk"
    bucketizer.write().overwrite().save(savePath)
    loadedBucketizer = Bucketizer.load(savePath)
    assert loadedBucketizer.getSplitsArray() == splitsArray
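For context, here is a minimal sketch (not part of the test above; the SparkSession and tiny DataFrame are illustrative, and multi-column Bucketizer support assumes Spark 3.0+) showing what this splitsArray configuration produces when applied:

from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.appName("bucketizer_sketch").getOrCreate()
df = spark.createDataFrame([(0.1,), (0.6,), (1.3,)], ["values"])

bucketizer = Bucketizer(
    splitsArray=[
        [-float("inf"), 0.5, 1.4, float("inf")],
        [-float("inf"), 0.1, 1.2, float("inf")],
    ],
    inputCols=["values", "values"],
    outputCols=["b1", "b2"],
)
# b1 and b2 hold the bucket index of "values" under each set of splits
bucketizer.transform(df).show()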
Example #2
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train, and evaluate the classifier: loop over the test/train splits for 4 metrics
    #

    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier on the training split
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the scores to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
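These main() excerpts print tables with tabulate and are meant to be run as scripts; a minimal module-level preamble they appear to assume (a hedged sketch, not shown in the excerpts themselves) would be:

import sys

from tabulate import tabulate  # used by the report-printing calls above


if __name__ == "__main__":
    main(sys.argv[1])  # pass the project base path on the command line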
Example #3
def main(base_path):

    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate the model (here on the same data it was fit on)
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
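The bucketizer, string indexer models, vector assembler, and classifier persisted above can be reloaded later for scoring; a minimal sketch, assuming the same base_path and the file names written by this run:

from pyspark.ml.feature import Bucketizer, StringIndexerModel, VectorAssembler
from pyspark.ml.classification import RandomForestClassificationModel

base_path = "."  # assumed: the same base path used for training
arrival_bucketizer = Bucketizer.load(
    "{}/models/arrival_bucketizer_2.0.bin".format(base_path))
carrier_indexer = StringIndexerModel.load(
    "{}/models/string_indexer_model_Carrier.bin".format(base_path))
vector_assembler = VectorAssembler.load(
    "{}/models/numeric_vector_assembler.bin".format(base_path))
model = RandomForestClassificationModel.load(
    "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path))
# Apply the same transformations to new records, then call model.transform()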
Example #4
def main(base_path):

    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate the model (here on the same data it was fit on)
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
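Note that this example (like Example #3) scores the same data the model was fit on; a hedged sketch of a held-out evaluation, reusing the rfc, evaluator, and final_vectorized_features names defined above:

# Sketch only: hold out 20% of the rows for evaluation instead of
# scoring the training data itself.
train_df, test_df = final_vectorized_features.randomSplit([0.8, 0.2], seed=17)
held_out_model = rfc.fit(train_df)
held_out_accuracy = evaluator.evaluate(held_out_model.transform(test_df))
print("Held-out accuracy = {}".format(held_out_accuracy))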
Example #5
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
        StructField("FlightTime", IntegerType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_flight_times.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay", "FlightTime"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_6.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train, and evaluate the classifier: loop over the test/train splits for 4 metrics
    #

    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier on the training split
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the scores to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
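The pickled score log written above persists between runs and can be inspected on its own; a minimal sketch, assuming base_path was "." and the metric names used above:

import pickle

from tabulate import tabulate

with open("./models/score_log.pickle", "rb") as f:
    score_log = pickle.load(f)

# One row per recorded training run, showing two of the logged metrics
rows = [(i, entry["accuracy"], entry["f1"]) for i, entry in enumerate(score_log)]
print(tabulate(rows, headers=["Run", "Accuracy", "F1"]))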
Example #6
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Import feature extraction tools from pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train, and evaluate the classifier: loop over the test/train splits for 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit a random forest classifier on the training split
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the scores to a score log that exists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute the existing score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append the existing average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append the existing average deltas to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
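The manual split loop above could also be written with pyspark.ml.tuning.CrossValidator, which manages the folds itself but scores one metric at a time; a minimal sketch reusing the column names and final_vectorized_features from above:

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

rfc = RandomForestClassifier(featuresCol="Features_vec",
                             labelCol="ArrDelayBucket",
                             predictionCol="Prediction",
                             maxBins=4896)
evaluator = MulticlassClassificationEvaluator(labelCol="ArrDelayBucket",
                                              predictionCol="Prediction",
                                              metricName="accuracy")
cross_validator = CrossValidator(estimator=rfc,
                                 estimatorParamMaps=ParamGridBuilder().build(),
                                 evaluator=evaluator,
                                 numFolds=3)
cv_model = cross_validator.fit(final_vectorized_features)
print(cv_model.avgMetrics)  # average accuracy across the 3 folds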
Example #7
def main():
    # Load the data
    refined_data = spark.read.parquet("{}/data/*".format(project_home))

    # Register a temp table
    refined_data.registerTempTable("Refined_Data")

    # Build the data used to train the model
    training_yet_data = spark.sql("""
    SELECT
      FlightNum,
      FlightDate,
      DayOfWeek,
      DayofMonth AS DayOfMonth,
      CONCAT(Month, '-',  DayofMonth) AS DayOfYear,
      Carrier,
      Origin,
      Dest,
      Distance,
      DepDelay,
      ArrDelay,
      CRSDepTime,
      CRSArrTime
    FROM Refined_Data
    """)

    # alter_feature_datetimes: parse the dates
    training_yet_data = training_yet_data.rdd.map(
        alter_feature_datetimes).toDF()

    # Replace FlightNum with a Route variable
    from pyspark.sql.functions import lit, concat

    features_with_route = training_yet_data.withColumn(
        'Route',
        concat(training_yet_data.Origin, lit('-'), training_yet_data.Dest))

    #### Bucketizer: split the target variable into classes ####
    from pyspark.ml.feature import Bucketizer

    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",  # raw target variable
        outputCol="ArrDelayBucket"  # bucketed target variable (classes)
    )

    # Save the Bucketizer object
    bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        project_home)
    print(bucketizer_path)
    bucketizer.write().overwrite().save(bucketizer_path)

    # Transform the data with the Bucketizer
    ml_bucketized_features = bucketizer.transform(features_with_route)

    #### StringIndexer: convert string category values into integer indexes ####
    from pyspark.ml.feature import StringIndexer

    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the StringIndexer object
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            project_home, column)
        print(string_indexer_output_path)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    #### VectorAssembler: vectorize the data ####
    from pyspark.ml.feature import VectorAssembler

    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    training_data = vector_assembler.transform(ml_bucketized_features)

    # Save the VectorAssembler object
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        project_home)
    print(vector_assembler_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the unneeded columns
    for column in index_columns:
        training_data = training_data.drop(column)

    # Model: random forest
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 maxBins=4657,
                                 maxMemoryInMB=1024,
                                 numTrees=10,
                                 maxDepth=10)

    # Start training
    model = rfc.fit(training_data)

    # Save the model object
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        project_home)
    print(model_output_path)
    model.write().overwrite().save(model_output_path)
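One small note on the API used in this example: DataFrame.registerTempTable has been deprecated since Spark 2.0 in favor of createOrReplaceTempView, so on current versions the equivalent call would be:

# Non-deprecated equivalent of the registerTempTable call above
refined_data.createOrReplaceTempView("Refined_Data")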
Example #8
def main(base_path):

    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    from pyspark.sql import SparkSession

    # Initialize PySpark with MongoDB support
    APP_NAME = "Deploying Predictive Systems in Realtime"
    spark = (
        SparkSession.builder.appName(APP_NAME)
        # Load support for MongoDB and Elasticsearch
        .config(
            "spark.jars.packages",
            "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,org.elasticsearch:elasticsearch-spark-30_2.12:7.14.2",
        )
        # Add configuration for MongoDB
        .config("spark.mongodb.input.uri",
                "mongodb://mongo:27017/test.coll").config(
                    "spark.mongodb.output.uri",
                    "mongodb://mongo:27017/test.coll").getOrCreate())
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")

    print("\nPySpark initialized...")

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import (
        StringType,
        IntegerType,
        FloatType,
        DoubleType,
        DateType,
        TimestampType,
    )
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat

    features_with_route = features.withColumn(
        "Route", concat(features.Origin, lit("-"), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into four delay categories (0, 1, 2, 3)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Setup the Departure Bucketizer for other examples
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    departure_bucketizer = Bucketizer(splits=splits,
                                      inputCol="DepDelay",
                                      outputCol="DepDelayBucket")

    # Save the departure bucketizer
    departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(
        base_path)
    departure_bucketizer.write().overwrite().save(departure_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features with tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier

    rfc = RandomForestClassifier(
        featuresCol="Features_vec",
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        maxBins=4657,
        maxMemoryInMB=1024,
    )
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = (
        "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".
        format(base_path))
    model.write().overwrite().save(model_output_path)

    # Evaluate the model on the same data it was fit on (no separate test split in this example)
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
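A typical entry point for running Example #8 directly might look like the sketch below (an assumption, not part of the original listing); the base path comes from the first command-line argument and defaults to the current directory:

import sys

if __name__ == "__main__":
    # Pass the project base path as the first CLI argument, defaulting to "."
    main(sys.argv[1] if len(sys.argv) > 1 else ".")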
Example #9
0
def main(iso_date, base_path):
  
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),     # "ArrDelay":5.0
    StructField("CRSArrTime", TimestampType(), True),    # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
    StructField("CRSDepTime", TimestampType(), True),    # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
    StructField("Carrier", StringType(), True),     # "Carrier":"WN"
    StructField("DayOfMonth", IntegerType(), True), # "DayOfMonth":31
    StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
    StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
    StructField("DepDelay", DoubleType(), True),     # "DepDelay":14.0
    StructField("Dest", StringType(), True),        # "Dest":"SAN"
    StructField("Distance", DoubleType(), True),     # "Distance":368.0
    StructField("FlightDate", DateType(), True),    # "FlightDate":"2015-12-30T16:00:00.000-08:00"
    StructField("FlightNum", StringType(), True),   # "FlightNum":"6109"
    StructField("Origin", StringType(), True),      # "Origin":"TUS"
  ])
  
  input_path = "{}/data/simple_flight_delay_features.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features.where(features[column].isNull()).count()) for column in features.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print(list(cols_with_nulls))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
  #
  from pyspark.ml.feature import Bucketizer
  
  splits = [-float("inf"), 15.0, 60.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  ml_bucketized_features = arrival_bucketizer.transform(features)
  arrival_bucketizer_path = "{}/models/arrival_bucketizer.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  departure_bucketizer = Bucketizer(
    splits=splits,
    inputCol="DepDelay",
    outputCol="DepDelayBucket"
  )
  ml_bucketized_features = departure_bucketizer.transform(ml_bucketized_features)
  departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(base_path)
  departure_bucketizer.write().overwrite().save(departure_bucketizer_path)
  
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket", "DepDelay", "DepDelayBucket").show()
  
  #
  # Extract features with tools from pyspark.ml.feature
  #
  from pyspark.ml import Pipeline
  from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer
  from pyspark.ml.feature import VectorAssembler
  
  # Turn category fields into categorical feature vectors, then drop the intermediate fields
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "FlightNum", "DepDelayBucket"]:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    one_hot_encoder = OneHotEncoder(
      dropLast=False,
      inputCol=column + "_index",
      outputCol=column + "_vec"
    )
    string_pipeline = Pipeline(stages=[string_indexer, one_hot_encoder])
    
    string_pipeline_model = string_pipeline.fit(ml_bucketized_features)
    ml_bucketized_features = string_pipeline_model.transform(ml_bucketized_features)
    
    ml_bucketized_features = ml_bucketized_features.drop(column).drop(column + "_index")
    
    # Save the pipeline model
    string_pipeline_output_path = "{}/models/string_indexer_pipeline_model_{}.bin".format(
      base_path,
      column
    )
    string_pipeline_model.write().overwrite().save(string_pipeline_output_path)
  
  # Handle continuous, numeric fields by combining them into one feature vector
  numeric_columns = ["DepDelay", "Distance"]
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns,
    outputCol="NumericFeatures_vec"
  )
  ml_bucketized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the original columns
  for column in numeric_columns:
    ml_bucketized_features = ml_bucketized_features.drop(column)
  
  # Combine various features into one feature vector, 'features'
  feature_columns = ["Carrier_vec", "DayOfMonth_vec", "DayOfWeek_vec", "DayOfYear_vec",
                     "Origin_vec", "Dest_vec", "FlightNum_vec", "DepDelayBucket_vec",
                     "NumericFeatures_vec"]
  final_assembler = VectorAssembler(
      inputCols=feature_columns,
      outputCol="Features_vec"
  )
  final_vectorized_features = final_assembler.transform(ml_bucketized_features)
  for column in feature_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Save the final assembler
  final_assembler_path = "{}/models/final_vector_assembler.bin".format(base_path)
  final_assembler.write().overwrite().save(final_assembler_path)
  
  # Inspect the finalized features
  final_vectorized_features = final_vectorized_features.limit(100000)  # development-only row cap; remove for full training runs
  final_vectorized_features.show()
  
  # Instantiate and fit random forest classifier on all the data
  from pyspark.ml.classification import RandomForestClassifier
  rfc = RandomForestClassifier(featuresCol="Features_vec", labelCol="ArrDelayBucket", predictionCol="Prediction")
  model = rfc.fit(final_vectorized_features)
  
  # Save the new model over the old one
  model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.bin".format(
    base_path
  )
  model.write().overwrite().save(model_output_path)
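At prediction time, each persisted piece saved by this example can be reloaded through its class's load() method; a rough sketch, assuming the same base_path and filenames used above:

from pyspark.ml import PipelineModel
from pyspark.ml.feature import Bucketizer, VectorAssembler
from pyspark.ml.classification import RandomForestClassificationModel

base_path = "."  # assumed project root

# Reload the bucketizers, one indexer/encoder pipeline per column, the assemblers and the classifier
arrival_bucketizer = Bucketizer.load(
    "{}/models/arrival_bucketizer.bin".format(base_path))
departure_bucketizer = Bucketizer.load(
    "{}/models/departure_bucketizer.bin".format(base_path))
carrier_pipeline = PipelineModel.load(
    "{}/models/string_indexer_pipeline_model_Carrier.bin".format(base_path))
vector_assembler = VectorAssembler.load(
    "{}/models/numeric_vector_assembler.bin".format(base_path))
final_assembler = VectorAssembler.load(
    "{}/models/final_vector_assembler.bin".format(base_path))
model = RandomForestClassificationModel.load(
    "{}/models/spark_random_forest_classifier.flight_delays.bin".format(base_path))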
Example #10
0
def main(base_path):
  
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),     # "ArrDelay":5.0
    StructField("CRSArrTime", TimestampType(), True),    # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
    StructField("CRSDepTime", TimestampType(), True),    # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
    StructField("Carrier", StringType(), True),     # "Carrier":"WN"
    StructField("DayOfMonth", IntegerType(), True), # "DayOfMonth":31
    StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
    StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
    StructField("DepDelay", DoubleType(), True),     # "DepDelay":14.0
    StructField("Dest", StringType(), True),        # "Dest":"SAN"
    StructField("Distance", DoubleType(), True),     # "Distance":368.0
    StructField("FlightDate", DateType(), True),    # "FlightDate":"2015-12-30T16:00:00.000-08:00"
    StructField("FlightNum", StringType(), True),   # "FlightNum":"6109"
    StructField("Origin", StringType(), True),      # "Origin":"TUS"
  ])
  
  input_path = "{}/data/simple_flight_delay_features.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features.where(features[column].isNull()).count()) for column in features.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print(list(cols_with_nulls))
  
  #
  # Add a Route variable to replace FlightNum
  #
  from pyspark.sql.functions import lit, concat
  features_with_route = features.withColumn(
    'Route',
    concat(
      features.Origin,
      lit('-'),
      features.Dest
    )
  )
  features_with_route.show(6)
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into four delay categories (0, 1, 2, 3)
  #
  from pyspark.ml.feature import Bucketizer

  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )

  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features with tools from pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Drop the original column
    ml_bucketized_features = ml_bucketized_features.drop(column)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Handle continuous, numeric fields by combining them into one feature vector
  numeric_columns = ["DepDelay", "Distance"]
  index_columns = ["Carrier_index", "DayOfMonth_index",
                   "DayOfWeek_index", "DayOfYear_index", "Origin_index",
                   "Origin_index", "Dest_index", "Route_index"]
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross-validate, train and evaluate the classifier: loop over the test/train splits for 4 metrics
  #

  from collections import defaultdict
  scores = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3

  for i in range(1, split_count + 1):
    print("Run {} out of {} of test/train splits in cross validation...".format(
        i,
        split_count,
      )
    )
  
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
  
    # Instantiate and fit random forest classifier on the training data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4657,
    )
    model = rfc.fit(training_data)
  
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
  
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)

      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))

  #
  # Evaluate average and STD of each metric
  #
  import numpy as np
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    print("AVG {} = {:.3f}".format(metric_name, average_accuracy))
  
    std_accuracy = np.std(metric_scores)
    print("STD {} = {:.3f}".format(metric_name, std_accuracy))

  #
  # Evaluate average and STD of each metric
  #
  import numpy as np
  score_averages = defaultdict(float)

  for metric_name in metric_names:
    metric_scores = scores[metric_name]
  
    average_accuracy = sum(metric_scores) / len(metric_scores)
    print("AVG {} = {:.4f}".format(metric_name, average_accuracy))
    score_averages[metric_name] = average_accuracy
  
    std_accuracy = np.std(metric_scores)
    print("STD {} = {:.4f}".format(metric_name, std_accuracy))

  #
  # Persist the scores to a score log that is kept between runs
  #
  import pickle

  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []

  # Compute this run's score log entry
  score_log_entry = {metric_name: score_averages[metric_name] for metric_name in metric_names}

  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry

  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    print("{} delta: {:.4f}".format(metric_name, run_delta))

  # Append this run's average scores to the log
  score_log.append(score_log_entry)

  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
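The pickled score log written above can be inspected between runs; a small sketch, assuming a base_path of ".":

import pickle

# Assumed path: the same score log this example writes to
score_log_filename = "./models/score_log.pickle"
with open(score_log_filename, "rb") as f:
    score_log = pickle.load(f)

# Each entry maps a metric name to its average score for one training run
for i, entry in enumerate(score_log):
    print("Run {}: {}".format(i + 1, entry))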
Example #11
0
def main(base_path):

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.json".format(base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into four delay categories (0, 1, 2, 3)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine the continuous, numeric fields with the indexes of the nominal ones into one feature vector
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross-validate, train and evaluate the classifier: loop over the test/train splits for 4 metrics
    #

    from collections import defaultdict
    scores = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("Run {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier on the training data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4657,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model using the test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:

            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

    #
    # Evaluate the average and standard deviation of each metric
    #
    import numpy as np
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        print("AVG {} = {:.3f}".format(metric_name, average_accuracy))

        std_accuracy = np.std(metric_scores)
        print("STD {} = {:.3f}".format(metric_name, std_accuracy))

    #
    # Evaluate the average and standard deviation of each metric
    #
    import numpy as np
    score_averages = defaultdict(float)

    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        print("AVG {} = {:.4f}".format(metric_name, average_accuracy))
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)
        print("STD {} = {:.4f}".format(metric_name, std_accuracy))

    #
    # Persist the scores to a score log that is kept between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute this run's score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        print("{} delta: {:.4f}".format(metric_name, run_delta))

    # Append this run's average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))
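Spark ML also offers a built-in alternative to the manual split loop used in these examples; a rough sketch with pyspark.ml.tuning.CrossValidator, where the empty parameter grid, the fold count, and the reuse of final_vectorized_features from above are illustrative assumptions:

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

rfc = RandomForestClassifier(
    featuresCol="Features_vec",
    labelCol="ArrDelayBucket",
    predictionCol="Prediction",
    maxBins=4657,
)

# A single, empty parameter map: cross-validate the baseline model without tuning
param_grid = ParamGridBuilder().build()

evaluator = MulticlassClassificationEvaluator(
    labelCol="ArrDelayBucket",
    predictionCol="Prediction",
    metricName="accuracy",
)

cross_validator = CrossValidator(
    estimator=rfc,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,  # assumed fold count, mirroring split_count above
)

# final_vectorized_features is the assembled DataFrame built earlier in the example
cv_model = cross_validator.fit(final_vectorized_features)
print("Average accuracy across folds: {}".format(cv_model.avgMetrics[0]))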