Example 1
    def test_save_and_load_on_nested_list_params(self):
        temp_path = tempfile.mkdtemp()
        splitsArray = [
            [-float("inf"), 0.5, 1.4, float("inf")],
            [-float("inf"), 0.1, 1.2, float("inf")],
        ]
        bucketizer = Bucketizer(splitsArray=splitsArray,
                                inputCols=["values", "values"],
                                outputCols=["b1", "b2"])
        savePath = temp_path + "/bk"
        bucketizer.write().overwrite().save(savePath)
        loadedBucketizer = Bucketizer.load(savePath)
        assert loadedBucketizer.getSplitsArray() == splitsArray
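# A minimal usage sketch for the saved model (assumes an active SparkSession named
# `spark`, plus the tempfile and pyspark.ml.feature.Bucketizer imports used above).
# Each input column is mapped to the bucket index defined by its splits array.
loaded = Bucketizer.load(savePath)
df = spark.createDataFrame([(0.1,), (0.7,), (1.5,)], ["values"])
loaded.transform(df).show()
# Columns b1 and b2 hold the bucket indices computed from splitsArray[0] and splitsArray[1].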
def main(iso_date, base_path):

    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today's date as an ISO string to scope the query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest))
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay and Distance
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(
        today_output_path)
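# The function above assumes module-level imports (e.g., sys and iso8601) from the full
# script; a minimal launch sketch, taking an ISO date and a base path from the command
# line, might look like this:
if __name__ == "__main__":
    import sys
    import iso8601  # package used above to parse the ISO date string

    main(sys.argv[1], sys.argv[2])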
def main(iso_date, base_path):

  APP_NAME = "make_predictions.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load each and every model in the pipeline
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string indexers into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model
  
  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)
    
  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
      base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Run the requests through the transformations from training
  #
  
  # Get today's date as an ISO string to scope the query
  today_dt = iso8601.parse_date(iso_date)
  rounded_today = today_dt.date()
  iso_today = rounded_today.isoformat()

  # Build the day's input path: a date based primary key directory structure
  today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
    base_path,
    iso_today
  )

  from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField

  schema = StructType([
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Timestamp", TimestampType(), True),
  ])
  
  prediction_requests = spark.read.json(today_input_path, schema=schema)
  prediction_requests.show()

  #
  # Add a Route variable to replace FlightNum
  #
  
  from pyspark.sql.functions import lit, concat
  prediction_requests_with_route = prediction_requests.withColumn(
    'Route',
    concat(
      prediction_requests.Origin,
      lit('-'),
      prediction_requests.Dest
    )
  )
  prediction_requests_with_route.show(6)
  
  # Index string fields with the corresponding indexer for that column
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer_model = string_indexer_models[column]
    prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
      
  # Vectorize numeric columns: DepDelay and Distance
  final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
  
  # Drop the indexes for the nominal fields
  index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                   "DayOfYear_index", "Origin_index", "Dest_index",
                   "Route_index"]
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)

  # Inspect the finalized features
  final_vectorized_features.show()
  
  # Make the prediction
  predictions = rfc.transform(final_vectorized_features)
  
  # Drop the features vector and prediction metadata to give the original fields
  predictions = predictions.drop("Features_vec")
  final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
  # Inspect the output
  final_predictions.show()
  
  # Build the day's output path: a date based primary key directory structure
  today_output_path = "{}/data/prediction_results_daily.json/{}".format(
    base_path,
    iso_today
  )
  
  # Save the output to its daily bucket
  final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in [
                "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
                "Dest", "Route"
        ]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
            "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
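# This streaming variant likewise relies on module-level names from the full script
# (json, iso8601, Row, SparkConf, SparkContext, StreamingContext, KafkaUtils). As a
# hedged sketch of the other side of the pipeline, a prediction request matching the
# message format shown in Example 7 could be published to the topic with the
# kafka-python package (an assumption; any Kafka producer would do):
import json

from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers="localhost:9092")
request = {
    "Carrier": "DL", "DayOfMonth": 25, "DayOfWeek": 4, "DayOfYear": 359,
    "DepDelay": 10.0, "Dest": "LAX", "Distance": 2475.0,
    "FlightDate": "2015-12-25", "FlightNum": None, "Origin": "JFK",
    "Timestamp": "2019-10-31T00:19:47.633280",
    "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385",
}
producer.send("flight_delay_classification_request",
              json.dumps(request).encode("utf-8"))
producer.flush()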
Example 5
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a DataFrame from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and the index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to MongoDB
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to MongoDB
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
def main(base_path):

  APP_NAME = "make_predictions_streaming.py"

  # Process data every 10 seconds
  PERIOD = 10
  BROKERS = 'localhost:9092'
  PREDICTION_TOPIC = 'flight_delay_classification_request'
  
  try:
    sc and ssc
  except NameError as e:
    import findspark

    # Add the streaming package and initialize
    findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
    findspark.init()
    
    import pyspark
    import pyspark.sql
    import pyspark.streaming
  
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf)
    ssc = StreamingContext(sc, PERIOD)
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load all models to be used in making predictions
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string field vectorizer pipelines into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model

  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)

  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Process Prediction Requests in Streaming
  #
  stream = KafkaUtils.createDirectStream(
    ssc,
    [PREDICTION_TOPIC],
    {
      "metadata.broker.list": BROKERS,
      "group.id": "0",
    }
  )

  object_stream = stream.map(lambda x: json.loads(x[1]))
  object_stream.pprint()
  
  row_stream = object_stream.map(
    lambda x: Row(
      FlightDate=iso8601.parse_date(x['FlightDate']),
      Origin=x['Origin'],
      Distance=x['Distance'],
      DayOfMonth=x['DayOfMonth'],
      DayOfYear=x['DayOfYear'],
      UUID=x['UUID'],
      DepDelay=x['DepDelay'],
      DayOfWeek=x['DayOfWeek'],
      FlightNum=x['FlightNum'],
      Dest=x['Dest'],
      Timestamp=iso8601.parse_date(x['Timestamp']),
      Carrier=x['Carrier']
    )
  )
  row_stream.pprint()

  #
  # Create a dataframe from the RDD-based object stream
  #

  def classify_prediction_requests(rdd):
  
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
  
    prediction_request_schema = StructType([
      StructField("Carrier", StringType(), True),
      StructField("DayOfMonth", IntegerType(), True),
      StructField("DayOfWeek", IntegerType(), True),
      StructField("DayOfYear", IntegerType(), True),
      StructField("DepDelay", DoubleType(), True),
      StructField("Dest", StringType(), True),
      StructField("Distance", DoubleType(), True),
      StructField("FlightDate", DateType(), True),
      StructField("FlightNum", StringType(), True),
      StructField("Origin", StringType(), True),
      StructField("Timestamp", TimestampType(), True),
      StructField("UUID", StringType(), True),
    ])
    
    prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests_df.withColumn(
      'Route',
      concat(
        prediction_requests_df.Origin,
        lit('-'),
        prediction_requests_df.Dest
      )
    )
    prediction_requests_with_route.show(6)
  
    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
      string_indexer_model = string_indexer_models[column]
      prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
  
    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
    
    # Inspect the vectors
    final_vectorized_features.show()
  
    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
      final_vectorized_features = final_vectorized_features.drop(column)
  
    # Inspect the finalized features
    final_vectorized_features.show()
  
    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)
  
    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
    # Inspect the output
    final_predictions.show()
  
    # Store to Mongo
    if final_predictions.count() > 0:
      final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
      )
  
  # Do the classification and store to Mongo
  row_stream.foreachRDD(classify_prediction_requests)
  
  ssc.start()
  ssc.awaitTermination()
Example 7
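# The snippet below references module-level names that are assumed to be defined
# earlier in the full script (APP_NAME, BROKERS, PREDICTION_TOPIC, pymongo, and the
# usual pyspark.sql aliases F and T). A minimal sketch of that setup:
import pymongo
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

APP_NAME = "make_predictions_streaming.py"
BROKERS = "localhost:9092"
PREDICTION_TOPIC = "flight_delay_classification_request"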
def main(base_path):

    spark = SparkSession.builder.config("spark.default.parallelism",
                                        1).appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Messages look like:
    #

    # {
    #   "Carrier": "DL",
    #   "DayOfMonth": 25,
    #   "DayOfWeek": 4,
    #   "DayOfYear": 359,
    #   "DepDelay": 10.0,
    #   "Dest": "LAX",
    #   "Distance": 2475.0,
    #   "FlightDate": "2015-12-25",
    #   "FlightNum": null,
    #   "Origin": "JFK",
    #   "Timestamp": "2019-10-31T00:19:47.633280",
    #   "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385"
    # }

    #
    # Process Prediction Requests from Kafka
    #
    message_df = spark \
      .readStream \
      .format("kafka") \
      .option("kafka.bootstrap.servers", BROKERS) \
      .option("subscribe", PREDICTION_TOPIC) \
      .load()

    # Define the schema for the incoming prediction request messages
    schema = T.StructType([
        T.StructField("Carrier", T.StringType()),
        T.StructField("DayOfMonth", T.IntegerType()),
        T.StructField("DayOfWeek", T.IntegerType()),
        T.StructField("DayOfYear", T.IntegerType()),
        T.StructField("DepDelay", T.FloatType()),
        T.StructField("Dest", T.StringType()),
        T.StructField("Distance", T.FloatType()),
        T.StructField("FlightDate", T.StringType()),
        T.StructField("FlightNum", T.StringType()),
        T.StructField("Origin", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
        T.StructField("UUID", T.StringType()),
    ])

    prediction_requests_df = message_df.select(
        F.from_json(F.col("value").cast("string"),
                    schema).alias("data")).select("data.*")

    #
    # Add a Route variable to replace FlightNum
    #
    prediction_requests_with_route = prediction_requests_df.withColumn(
        'Route',
        F.concat(prediction_requests_df.Origin, F.lit('-'),
                 prediction_requests_df.Dest))

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the individual index columns
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Store the results to MongoDB
    class MongoWriter:
        def open(self, partition_id, epoch_id):
            print(f"Opened partition id: {partition_id}, epoch: {epoch_id}")

            self.mongo_client = pymongo.MongoClient()
            print(f"Opened MongoClient: {self.mongo_client}")

            return True

        def process(self, row):
            print(f"Processing row: {row}")

            as_dict = row.asDict()
            print(f"Inserting row.asDict(): {as_dict}")

            id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one(
                as_dict)
            print(f"Inserted row, got ID: {id.inserted_id}")

            return True

        def close(self, error):
            # Close the MongoClient when the partition is finished, not after each row
            self.mongo_client.close()
            print("Closed with error: %s" % str(error))

            return True

    query = final_predictions.writeStream.foreach(MongoWriter()).start()

    query.awaitTermination()
Example 8
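# As in the batch example above, this older variant assumes sys and iso8601 are
# imported at module level and that main() is called with an ISO date string and a
# base path (e.g., from sys.argv).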
def main(iso_date, base_path):

    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load the departure delay bucketizer
    departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(
        base_path)
    departure_bucketizer = Bucketizer.load(departure_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml import PipelineModel

    string_vectorizer_pipeline_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "FlightNum", "DepDelayBucket"
    ]:
        string_pipeline_model_path = "{}/models/string_indexer_pipeline_model_{}.bin".format(
            base_path, column)
        string_pipeline_model = PipelineModel.load(string_pipeline_model_path)
        string_vectorizer_pipeline_models[column] = string_pipeline_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the final assembler
    final_assembler_path = "{}/models/final_vector_assembler.bin".format(
        base_path)
    final_assembler = VectorAssembler.load(final_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today's date as an ISO string to scope the query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    # Bucketize the departure delay for classification
    ml_bucketized_features = departure_bucketizer.transform(
        prediction_requests)

    # Check the buckets
    ml_bucketized_features.select("DepDelay", "DepDelayBucket").show()

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "FlightNum", "DepDelayBucket"
    ]:
        string_pipeline_model = string_vectorizer_pipeline_models[column]
        ml_bucketized_features = string_pipeline_model.transform(
            ml_bucketized_features)
        ml_bucketized_features = ml_bucketized_features.drop(column + "_index")

    # Vectorize numeric columns
    ml_bucketized_features = vector_assembler.transform(ml_bucketized_features)

    # Drop the original numeric columns
    numeric_columns = ["DepDelay", "Distance"]

    # Combine various features into one feature vector, 'features'
    final_vectorized_features = final_assembler.transform(
        ml_bucketized_features)
    final_vectorized_features.show()

    # Drop the individual vector columns
    feature_columns = [
        "Carrier_vec", "DayOfMonth_vec", "DayOfWeek_vec", "DayOfYear_vec",
        "Origin_vec", "Dest_vec", "FlightNum_vec", "DepDelayBucket_vec",
        "NumericFeatures_vec"
    ]
    for column in feature_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(
        today_output_path)