def test_save_and_load_on_nested_list_params(self):
    temp_path = tempfile.mkdtemp()
    splitsArray = [
        [-float("inf"), 0.5, 1.4, float("inf")],
        [-float("inf"), 0.1, 1.2, float("inf")],
    ]
    bucketizer = Bucketizer(splitsArray=splitsArray,
                            inputCols=["values", "values"],
                            outputCols=["b1", "b2"])
    savePath = temp_path + "/bk"
    bucketizer.write().overwrite().save(savePath)
    loadedBucketizer = Bucketizer.load(savePath)
    assert loadedBucketizer.getSplitsArray() == splitsArray
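
# A minimal usage sketch (not from the source) of what the multi-column Bucketizer
# exercised by the test above does at transform time. The DataFrame, column names,
# and the SparkSession argument are assumptions for illustration only; multi-column
# Bucketizer (splitsArray/inputCols/outputCols) requires Spark 3.0+.
def _example_bucketizer_usage(spark):
    from pyspark.ml.feature import Bucketizer

    df = spark.createDataFrame([(0.2,), (1.0,), (2.5,)], ["values"])
    bucketizer = Bucketizer(
        splitsArray=[
            [-float("inf"), 0.5, 1.4, float("inf")],
            [-float("inf"), 0.1, 1.2, float("inf")],
        ],
        inputCols=["values", "values"],
        outputCols=["b1", "b2"],
    )
    # Each output column holds the index of the bucket its input value falls into
    bucketizer.transform(df).show()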
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # tabulate renders the report tables below (module-level import not shown in this excerpt)
    from tabulate import tabulate

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns, outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate classifier: loop over the test/train splits and compute 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(i, split_count))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit random forest classifier on all the data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
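
# How this script is launched is not shown in the excerpt above. A minimal entry
# point (an assumption, not taken from the source) that passes the base path in
# from the command line might look like this:
if __name__ == "__main__":
    import sys
    main(sys.argv[1] if len(sys.argv) > 1 else ".")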
def main(base_path):
    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(), True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns, outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate the model on the same data it was fit on (no separate test set in this version)
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
def main(base_path):
    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(), True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns, outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit a random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate the model on the same data it was fit on (no separate test set in this version)
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # tabulate renders the report tables below (module-level import not shown in this excerpt)
    from tabulate import tabulate

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
        StructField("FlightTime", IntegerType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_flight_times.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay", "FlightTime"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns, outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_6.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate classifier: loop over the test/train splits and compute 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(i, split_count))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model using the test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display the sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
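
# A small, hypothetical helper (not part of the source) for inspecting the score
# log that successive training runs append to; the path matches the save calls above.
def print_score_history(base_path="."):
    import pickle
    with open("{}/models/score_log.pickle".format(base_path), "rb") as f:
        for i, entry in enumerate(pickle.load(f)):
            print("Run {}: {}".format(i, entry))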
def main():
    # Fetch the data
    refined_data = spark.read.parquet("{}/data/*".format(project_home))

    # Register a temporary table
    refined_data.registerTempTable("Refined_Data")

    # Create the dataset used to train the model
    training_yet_data = spark.sql("""
        SELECT
          FlightNum,
          FlightDate,
          DayOfWeek,
          DayofMonth AS DayOfMonth,
          CONCAT(Month, '-', DayofMonth) AS DayOfYear,
          Carrier,
          Origin,
          Dest,
          Distance,
          DepDelay,
          ArrDelay,
          CRSDepTime,
          CRSArrTime
        FROM Refined_Data
    """)

    # alter_feature_datetimes: parse the date/time fields
    training_yet_data = training_yet_data.rdd.map(
        alter_feature_datetimes).toDF()

    # Replace the flight number with the route
    from pyspark.sql.functions import lit, concat
    features_with_route = training_yet_data.withColumn(
        'Route',
        concat(training_yet_data.Origin, lit('-'), training_yet_data.Dest))

    #### Bucketizer: split the target variable into classes ####
    from pyspark.ml.feature import Bucketizer

    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",  # raw target variable
        outputCol="ArrDelayBucket"  # bucketized target variable
    )

    # Save the Bucketizer object
    bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        project_home)
    print(bucketizer_path)
    bucketizer.write().overwrite().save(bucketizer_path)

    # Transform the data with the Bucketizer
    ml_bucketized_features = bucketizer.transform(features_with_route)

    #### StringIndexer: convert string category values into integer indexes ####
    from pyspark.ml.feature import StringIndexer

    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the StringIndexer object
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            project_home, column)
        print(string_indexer_output_path)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    #### VectorAssembler: vectorize the data ####
    from pyspark.ml.feature import VectorAssembler

    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns, outputCol="Features_vec")
    training_data = vector_assembler.transform(ml_bucketized_features)

    # Save the VectorAssembler object
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        project_home)
    print(vector_assembler_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the columns that are no longer needed
    for column in index_columns:
        training_data = training_data.drop(column)

    # Model: random forest
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 maxBins=4657,
                                 maxMemoryInMB=1024,
                                 numTrees=10,
                                 maxDepth=10)

    # Start training
    model = rfc.fit(training_data)

    # Save the model object
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        project_home)
    print(model_output_path)
    model.write().overwrite().save(model_output_path)
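
# This script refers to `spark`, `project_home`, and `alter_feature_datetimes`
# without defining them in this excerpt; their real definitions presumably live
# elsewhere in the project. A minimal, assumed module-level setup (not from the
# source) for the first two might look like this:
if __name__ == "__main__":
    import os
    from pyspark.sql import SparkSession

    project_home = os.environ.get("PROJECT_HOME", ".")
    spark = SparkSession.builder.appName("train_spark_mllib_model").getOrCreate()
    main()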
def main(base_path):
    # Default to "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    from pyspark.sql import SparkSession

    # Initialize PySpark with MongoDB support
    APP_NAME = "Deploying Predictive Systems in Realtime"
    spark = (
        SparkSession.builder.appName(APP_NAME)
        # Load support for MongoDB and Elasticsearch
        .config(
            "spark.jars.packages",
            "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,org.elasticsearch:elasticsearch-spark-30_2.12:7.14.2",
        )
        # Add configuration for MongoDB
        .config("spark.mongodb.input.uri", "mongodb://mongo:27017/test.coll")
        .config("spark.mongodb.output.uri", "mongodb://mongo:27017/test.coll")
        .getOrCreate())
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")

    print("\nPySpark initialized...")

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import (
        StringType,
        IntegerType,
        FloatType,
        DoubleType,
        DateType,
        TimestampType,
    )
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(), True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        "Route", concat(features.Origin, lit("-"), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Set up the departure Bucketizer for other examples
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    departure_bucketizer = Bucketizer(splits=splits,
                                      inputCol="DepDelay",
                                      outputCol="DepDelayBucket")

    # Save the departure bucketizer
    departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(
        base_path)
    departure_bucketizer.write().overwrite().save(departure_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns, outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
        featuresCol="Features_vec",
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        maxBins=4657,
        maxMemoryInMB=1024,
    )
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = (
        "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".
        format(base_path))
    model.write().overwrite().save(model_output_path)

    # Evaluate the model on the same data it was fit on (no separate test set in this version)
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
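
# A brief sketch (an assumption, not from the source) of how the artifacts saved by
# this script could be reloaded at prediction time. The paths mirror the save calls
# above, and an active SparkSession is required before calling load().
def load_persisted_models(base_path="."):
    from pyspark.ml.feature import Bucketizer, StringIndexerModel, VectorAssembler
    from pyspark.ml.classification import RandomForestClassificationModel

    arrival_bucketizer = Bucketizer.load(
        "{}/models/arrival_bucketizer_2.0.bin".format(base_path))
    string_indexer_models = {
        column: StringIndexerModel.load(
            "{}/models/string_indexer_model_{}.bin".format(base_path, column))
        for column in ["Carrier", "Origin", "Dest", "Route"]
    }
    vector_assembler = VectorAssembler.load(
        "{}/models/numeric_vector_assembler.bin".format(base_path))
    model = RandomForestClassificationModel.load(
        "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
            base_path))
    return arrival_bucketizer, string_indexer_models, vector_assembler, model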
def main(iso_date, base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(), True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.json".format(base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay and DepDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    splits = [-float("inf"), 15.0, 60.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")
    ml_bucketized_features = arrival_bucketizer.transform(features)

    arrival_bucketizer_path = "{}/models/arrival_bucketizer.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    departure_bucketizer = Bucketizer(splits=splits,
                                      inputCol="DepDelay",
                                      outputCol="DepDelayBucket")
    ml_bucketized_features = departure_bucketizer.transform(
        ml_bucketized_features)

    departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(
        base_path)
    departure_bucketizer.write().overwrite().save(departure_bucketizer_path)

    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket", "DepDelay",
                                  "DepDelayBucket").show()

    #
    # Feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorIndexer
    from pyspark.ml.feature import VectorAssembler

    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "FlightNum", "DepDelayBucket"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        one_hot_encoder = OneHotEncoder(dropLast=False,
                                        inputCol=column + "_index",
                                        outputCol=column + "_vec")
        string_pipeline = Pipeline(stages=[string_indexer, one_hot_encoder])
        string_pipeline_model = string_pipeline.fit(ml_bucketized_features)
        ml_bucketized_features = string_pipeline_model.transform(
            ml_bucketized_features)
        ml_bucketized_features = ml_bucketized_features.drop(column).drop(
            column + "_index")

        # Save the pipeline model
        string_pipeline_output_path = "{}/models/string_indexer_pipeline_model_{}.bin".format(
            base_path, column)
        string_pipeline_model.write().overwrite().save(
            string_pipeline_output_path)

    # Handle continuous, numeric fields by combining them into one feature vector
    numeric_columns = ["DepDelay", "Distance"]
    vector_assembler = VectorAssembler(inputCols=numeric_columns,
                                       outputCol="NumericFeatures_vec")
    ml_bucketized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the original columns
    for column in numeric_columns:
        ml_bucketized_features = ml_bucketized_features.drop(column)

    # Combine various features into one feature vector, 'features'
    feature_columns = ["Carrier_vec", "DayOfMonth_vec", "DayOfWeek_vec",
                       "DayOfYear_vec", "Origin_vec", "Dest_vec",
                       "FlightNum_vec", "DepDelayBucket_vec",
                       "NumericFeatures_vec"]
    final_assembler = VectorAssembler(inputCols=feature_columns,
                                      outputCol="Features_vec")
    final_vectorized_features = final_assembler.transform(
        ml_bucketized_features)
    for column in feature_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Save the final assembler
    final_assembler_path = "{}/models/final_vector_assembler.bin".format(
        base_path)
    final_assembler.write().overwrite().save(final_assembler_path)

    # Inspect the finalized features
    final_vectorized_features = final_vectorized_features.limit(
        100000)  # remove me, I am for the author's development
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction")
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),       # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),        # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),    # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),     # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),     # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),       # "DepDelay":14.0
        StructField("Dest", StringType(), True),           # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),       # "Distance":368.0
        StructField("FlightDate", DateType(), True),       # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),      # "FlightNum":"6109"
        StructField("Origin", StringType(), True),         # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.json".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route',
        concat(
            features.Origin,
            lit('-'),
            features.Dest
        )
    )
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into early, on-time, slightly late, very late (0, 1, 2, 3)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature extraction tools in pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the indexer model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Handle continuous, numeric fields by combining them with the indexes into one feature vector
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                     "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate the classifier: loop through the test/train splits for 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("Run {} out of {} of test/train splits in cross validation...".format(
            i,
            split_count,
        ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

        # Instantiate and fit a random forest classifier on the training data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4657,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path
        )
        model.write().overwrite().save(model_output_path)

        # Evaluate the model using the test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name
            )
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

    #
    # Evaluate the average and standard deviation of each metric
    #
    import numpy as np
    score_averages = defaultdict(float)

    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        print("AVG {} = {:.4f}".format(metric_name, average_accuracy))
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)
        print("STD {} = {:.4f}".format(metric_name, std_accuracy))

    #
    # Persist the scores to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the new score log entry from this run's averages
    score_log_entry = {metric_name: score_averages[metric_name] for metric_name in metric_names}

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        print("{} delta: {:.4f}".format(metric_name, run_delta))

    # Append the new average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))
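# A small sketch (not from the original listing) of reading back the score log that the
# function above persists, to review how the metric averages have moved across runs.
# The path and the list-of-dicts structure come from the training code; the function
# name and output formatting are illustrative.
def print_score_history_sketch(base_path):
    import pickle

    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    with open(score_log_filename, "rb") as f:
        # A list with one {metric_name: average} dict per training run
        score_log = pickle.load(f)

    for run_number, entry in enumerate(score_log, start=1):
        line = ", ".join(
            "{} = {:.4f}".format(name, value) for name, value in sorted(entry.items())
        )
        print("Run {}: {}".format(run_number, line))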
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),       # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),        # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),    # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),     # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),     # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),       # "DepDelay":14.0
        StructField("Dest", StringType(), True),           # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),       # "Distance":368.0
        StructField("FlightDate", DateType(), True),       # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),      # "FlightNum":"6109"
        StructField("Origin", StringType(), True),         # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.json".format(base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route',
        concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into early, on-time, slightly late, very late (0, 1, 2, 3)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature extraction tools in pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
            "Origin", "Dest", "Route"
    ]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the indexer model
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine the continuous, numeric fields with the indexes of the categorical fields
    # into one feature vector
    numeric_columns = ["DepDelay", "Distance"]
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns + index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate the classifier: loop through the test/train splits for 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("Run {} out of {} of test/train splits in cross validation...".format(
            i,
            split_count,
        ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier on the training data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4657,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model using the test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

    #
    # Evaluate the average and standard deviation of each metric
    #
    import numpy as np
    score_averages = defaultdict(float)

    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        print("AVG {} = {:.4f}".format(metric_name, average_accuracy))
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)
        print("STD {} = {:.4f}".format(metric_name, std_accuracy))

    #
    # Persist the scores to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the new score log entry from this run's averages
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        print("{} delta: {:.4f}".format(metric_name, run_delta))

    # Append the new average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))