Example #1
def main():
    # spark = SparkSession.builder.appName('google-play-store-streamer').getOrCreate()
    sc = SparkContext(appName="PysparkStreaming").getOrCreate()
    ssc = StreamingContext(sc, 3)

    # Load Model
    model = RandomForestClassificationModel.load(MODEL_PATH)

    def parseStream(rdd):
        if not rdd.isEmpty():
            # SparkContext has no .read; use a SparkSession to read the RDD as JSON
            from pyspark.sql import SparkSession
            spark = SparkSession.builder.getOrCreate()
            df = spark.read.json(rdd)
            df.show()
            # Vectorize data
            feature_cols = df.columns
            feature_cols.remove('Installs indexed')
            assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol="features",
                                        handleInvalid="error")
            pipeline = Pipeline(stages=[assembler])
            outputModel = pipeline.fit(df)
            output = outputModel.transform(df)
            final_data = output.select("features", "Installs indexed")
            # Predict
            predictions = model.transform(final_data)
            evaluator = MulticlassClassificationEvaluator(
                labelCol="Installs indexed",
                predictionCol="prediction",
                metricName="accuracy")
            accuracy = evaluator.evaluate(predictions)
            print("Random forest test Error = %g" % (1.0 - accuracy))
            randomForestError = (1.0 - accuracy)
            print(randomForestError)

    stream_data = ssc.textFileStream('StreamData/')
    stream_data.foreachRDD(lambda rdd: parseStream(rdd))

    ssc.start()
    ssc.awaitTermination()
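
The snippet above relies on several imports and a MODEL_PATH constant that are not shown. A minimal, assumed preamble (the model path is a placeholder, not the original one):

# Assumed preamble for the streaming example above; MODEL_PATH is hypothetical.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

MODEL_PATH = "models/installs_rf_model"  # placeholder path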
Example #2
def sendRecord(df):

    # Imports this function relies on (the surrounding script may already provide them)
    from pyspark.ml.classification import RandomForestClassificationModel
    from pyspark.ml.feature import VectorAssembler
    from pyspark.sql import Row
    from pyspark.sql.types import FloatType, IntegerType

    sameModel = RandomForestClassificationModel.load("randomForest.model")

    df = df.withColumn("amount", df["amount"].cast(FloatType()))
    df = df.withColumn("newbalanceDest",
                       df["newbalanceDest"].cast(FloatType()))
    df = df.withColumn("newbalanceOrig",
                       df["newbalanceOrig"].cast(FloatType()))
    df = df.withColumn("oldbalanceDest",
                       df["oldbalanceDest"].cast(FloatType()))
    df = df.withColumn("oldbalanceOrg", df["oldbalanceOrg"].cast(FloatType()))
    df = df.withColumn("isFlaggedFraud",
                       df["isFlaggedFraud"].cast(IntegerType()))
    df = df.withColumn("step", df["step"].cast(IntegerType()))
    df = df.withColumn("Type", df["Type"].cast(IntegerType()))

    assembler = VectorAssembler(inputCols=[
        "Type", "amount", "newbalanceDest", "newbalanceOrig", "oldbalanceDest",
        "oldbalanceOrg", "step"
    ],
                                outputCol="features")

    output = assembler.transform(df).select("features")

    predictions = sameModel.transform(output)

    pr = predictions.select("prediction")

    pr = pr.rdd

    if pr.collect() == [Row(prediction=1.0)]:
        print("FRAUD!!!!")
    else:
        print("Not Fraud")
Example #3
def main(iso_date, base_path):

    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today and tomorrow's dates as iso strings to scope query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest))
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay and Distance
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(
        today_output_path)
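
main(iso_date, base_path) is presumably driven from the command line; a minimal entry point under that assumption:

# Hypothetical driver; the argument order follows main()'s signature.
if __name__ == "__main__":
    import sys
    main(sys.argv[1], sys.argv[2])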
Example #4
predictions = rForestModel.transform(pTestDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="f1")
evaluator.evaluate(predictions)

# %%

lr = LogisticRegression(featuresCol='features', labelCol='class')
lrModel = lr.fit(pTrainDF)
predictionsLR = lrModel.transform(pTestDF)
evaluator.evaluate(predictionsLR)

# %%
naiveBayes = NaiveBayes(featuresCol='features', labelCol='class')
naiveModel = naiveBayes.fit(pTrainDF)
predictionsNaive = naiveModel.transform(pTestDF)
evaluator.evaluate(predictionsNaive)


# %%
pipelineModel.save('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/pipelineW2V')
rForestModel.save('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/rForest')

#%%
pipelineModel = PipelineModel.load('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/pipelineW2V')
rForestModel = RandomForestClassificationModel.load('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/rForest')


# %%
def main(iso_date, base_path):

  APP_NAME = "make_predictions.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load each and every model in the pipeline
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string indexers into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model
  
  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)
    
  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
      base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Run the requests through the transformations from training
  #
  
  # Get today and tomorrow's dates as iso strings to scope the query
  today_dt = iso8601.parse_date(iso_date)
  rounded_today = today_dt.date()
  iso_today = rounded_today.isoformat()

  # Build the day's input path: a date based primary key directory structure
  today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
    base_path,
    iso_today
  )

  from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField

  schema = StructType([
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Timestamp", TimestampType(), True),
  ])
  
  prediction_requests = spark.read.json(today_input_path, schema=schema)
  prediction_requests.show()

  #
  # Add a Route variable to replace FlightNum
  #
  
  from pyspark.sql.functions import lit, concat
  prediction_requests_with_route = prediction_requests.withColumn(
    'Route',
    concat(
      prediction_requests.Origin,
      lit('-'),
      prediction_requests.Dest
    )
  )
  prediction_requests_with_route.show(6)
  
  # Index string fields with the corresponding indexer for that column
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer_model = string_indexer_models[column]
    prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
      
  # Vectorize numeric columns: DepDelay and Distance
  final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
  
  # Drop the indexes for the nominal fields
  index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                   "DayOfYear_index", "Origin_index",
                   "Dest_index", "Route_index"]
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)

  # Inspect the finalized features
  final_vectorized_features.show()
  
  # Make the prediction
  predictions = rfc.transform(final_vectorized_features)
  
  # Drop the features vector and prediction metadata to give the original fields
  predictions = predictions.drop("Features_vec")
  final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
  # Inspect the output
  final_predictions.show()
  
  # Build the day's output path: a date based primary key directory structure
  today_output_path = "{}/data/prediction_results_daily.json/{}".format(
    base_path,
    iso_today
  )
  
  # Save the output to its daily bucket
  final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)
Example #6
    def label_failure_modes(cls, site, did, rd_item, df, model_dir, sc):
        '''
        :param site: site, e.g. 'fab15', 'fab10'
        :param did: design id, e.g. 'Z32D'
        :param rd_item: rd bin in string format, e.g. 'rdC'
        '''
        start_time = time.time()
        #Convert to Pandas dataframe
        #        df = df.toPandas()
        #        if 'FBD_REGION' in df.columns:
        #            df['FBD_REGION'] = df['FBD_REGION'].apply(lambda x : cls.label_zone(x))

        labelled_failure_modes = []
        df = df.withColumn("row_id", F.monotonically_increasing_id())
        model_features_list, model_name_list, model_dir_list = cls.__read_model_name(
            site, did, rd_item, model_dir)
        print(model_dir_list)
        if len(model_name_list) > 0:
            for name, features, dirname in zip(model_name_list,
                                               model_features_list,
                                               model_dir_list):
                features_missing = [e for e in features if e not in df.columns]
                if len(features_missing) > 0:
                    print('Features %s missing for model %s' %
                          (','.join(features_missing), name))
                else:
                    print(dirname)
                    print(features)
                    try:
                        model = RandomForestClassificationModel.load(
                            str(dirname))
                        # model = LinearSVCModel.load(model_dir)
                        assembler = VectorAssembler(inputCols=features,
                                                    outputCol="features")
                        # Set maxCategories so features with > 4 distinct values are treated as continuous.
                        newData = assembler.transform(df)
                        df_i = model.transform(newData)
                        #df_i = cls.pred_rf_model_spark(dirname, feature, name, df)
                        df_i = df_i.withColumnRenamed("prediction", name)
                        df = df.join(df_i.select("row_id", name), ("row_id"))
                        labelled_failure_modes.append(name)
                        print('Labelling done for: ', name)
                    except Exception:
                        print('Labelling failed for: ', name)
            if len(labelled_failure_modes) > 0:
                df = df.withColumn(
                    'total', sum(df[col] for col in labelled_failure_modes))
                df_labelled = df.filter(df.total > 0)
                df_unlabelled = df.filter(df.total == 0)
            else:
                df_labelled = []
                df_unlabelled = df
        else:
            df_labelled = []
            df_unlabelled = df
            print('No models found for: %s, %s, %s' % (site, did, rd_item))

        print('Labelling time = ', time.time() - start_time)
        start_time = time.time()
        if df_labelled != []:
            df_labelled = df_labelled.toPandas()
        else:
            df_labelled = pd.DataFrame()
        df_unlabelled = df_unlabelled.toPandas()
        print('Pandas df conversion time = ', time.time() - start_time)
        return df_labelled, df_unlabelled, labelled_failure_modes
wine.limit(20).show()

# In[ ]:
from pyspark.ml.feature import VectorAssembler

# select the columns to be used as the features (all except `quality`)
featureColumns = [c for c in wine.columns if c != 'quality']

# create and configure the assembler
assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")

# transform the original data
dataDF = assembler.transform(wine)
dataDF.printSchema()

# calculate the average wine quality
avgQuality = wine.groupBy().avg('quality').first()[0]
print(avgQuality)

from pyspark.ml.classification import RandomForestClassificationModel

rfObjectFileLoaded = sc._jsc.objectFile(
    "hdfs://ec2-3-88-182-126.compute-1.amazonaws.com:9000/home/ubuntu/sparkfolder/spark-2.4.7-bin-hadoop2.7/output/rf.model"
)
rfModelLoaded_JavaObject = rfObjectFileLoaded.first()
rfModelLoaded = RandomForestClassificationModel(rfModelLoaded_JavaObject)
loadedPredictionsDF = rfModelLoaded.transform(wine)

# evaluate the model again to see if we get the same performance
print("Loaded model RMSE = %g" % evaluator.evaluate(loadedPredictionsDF))
# Calculate and print Recall score for Decision Tree Algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
dtcWeightedRecall = evaluator.evaluate(dtcPredictions)
print("Decision Tree weightedRecall Error = %g" % (dtcWeightedRecall))

# Train a RandomForest algorithm
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
rfm = rf.fit(trainingData)

# Save trained Random Forest model to S3 bucket for future use
rfm.save('s3://expedia-hotel-recommendations-workflow/rfm_model')

# Load the pre-trained Random Forest model to illustrate how it will be imported for future use
rfModel = RandomForestClassificationModel.load("s3://expedia-hotel-recommendations-workflow/rfm_model")

# Make predictions with Random Forest model
rfPredictions = rfModel.transform(testData)

# Calculate and print Accuracy score for Random Forest Algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
rfAccuracy = evaluator.evaluate(rfPredictions)
print("Random Forest accuracy Error = %g" % (rfAccuracy))

# Calculate and print F1 score for Random Forest Algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
rfF1 = evaluator.evaluate(rfPredictions)
print("Random Forest f1 Error = %g" % (rfF1))
Example #9
spark = SparkSession.builder.master("local").appName(
    "wineClasssification").getOrCreate()

######################### Reading Dataset########################
testDf = spark.read.csv('TestDataset.csv',
                        header='true',
                        inferSchema='true',
                        sep=';')
#testDf = spark.read.csv('hdfs://ip-172-31-19-75.ec2.internal:8020/TestDataset.csv',header='true', inferSchema='true', sep=';')
feature = [c for c in testDf.columns if c != 'quality']
assembler_test = VectorAssembler(inputCols=feature, outputCol="features")
test_trans = assembler_test.transform(testDf)
#test_trans.printSchema()

######################### Loading Model ############################
model = RandomForestClassificationModel.load("wine_train_model")

######################### Predicting ##########################
predictions = model.transform(test_trans)
## The value passed to show() only controls how many rows are printed
#predictions.select("quality", "features").show(1000)

######################### Printing Accuracy ##########################
eval = MulticlassClassificationEvaluator(labelCol="quality",
                                         predictionCol="prediction",
                                         metricName="accuracy")
accuracy = eval.evaluate(predictions)
print("accuracy test Error = %g" % (1.0 - accuracy))

from pyspark.mllib.evaluation import MulticlassMetrics
transformed_data = model.transform(test_trans)
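
MulticlassMetrics is imported but never used in this example; a short sketch of how it could consume transformed_data, assuming the label column is "quality":

# Sketch only: build (prediction, label) pairs and compute RDD-based metrics.
predictionAndLabels = transformed_data.select("prediction", "quality") \
    .rdd.map(lambda row: (float(row[0]), float(row[1])))
metrics = MulticlassMetrics(predictionAndLabels)
print("Weighted F1 = %g" % metrics.weightedFMeasure())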
Example #10
def loadRFModel(df):
    assembler = VectorAssembler(inputCols=['cid', 'GPA'], outputCol='features')
    output = assembler.transform(df)
    model = RandomForestClassificationModel.load("rf_model")
    ret = model.transform(output).select('cid', 'prediction')
    return ret.head(7)
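
A usage sketch for loadRFModel, assuming an active SparkSession named spark and that "rf_model" was trained on exactly these two features:

# Hypothetical invocation; the sample values are made up for illustration.
sample_df = spark.createDataFrame([(1, 3.2), (2, 2.7)], ["cid", "GPA"])
for row in loadRFModel(sample_df):
    print(row["cid"], row["prediction"])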
Example #11
#helper functions – helper.py file
from pyspark.sql import functions as F
import pickle
from pyspark.ml import PipelineModel
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, DoubleType

# read model objects saved from the training process
path_to_read_objects = '/deploy'

#pyspark objects
char_labels = PipelineModel.load(path_to_read_objects + '/char_label_model.h5')
assembleModel = PipelineModel.load(path_to_read_objects + '/assembleModel.h5')
clf_model = RandomForestClassificationModel.load(path_to_read_objects +
                                                 '/clf_model.h5')
#python objects
with open(path_to_read_objects + '/file.pkl', 'rb') as handle:
    features_list, char_vars, num_vars = pickle.load(handle)


#make necessary transformations
def rename_columns(df, char_vars):
    mapping = dict(zip([i + '_index' for i in char_vars], char_vars))
    df = df.select([F.col(c).alias(mapping.get(c, c)) for c in df.columns])
    return df


# score the new data
def score_new_df(scoredf):
    X = scoredf.select(features_list)
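
score_new_df is truncated here. A hedged sketch of how the loaded objects are commonly chained for scoring (an assumption, not the author's original body):

# Sketch: index string columns, rename them back, assemble features, then predict.
def score_new_df_sketch(scoredf):
    X = scoredf.select(features_list)
    X = char_labels.transform(X)                     # adds *_index columns
    X = X.select([c for c in X.columns if c not in char_vars])
    X = rename_columns(X, char_vars)                 # *_index -> original names
    X = assembleModel.transform(X)                   # builds the 'features' vector
    return clf_model.transform(X)                    # adds prediction / probability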
Example #12
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a DataFrame from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and the index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to MongoDB
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to MongoDB
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
def main(base_path):

  APP_NAME = "make_predictions_streaming.py"

  # Process data every 10 seconds
  PERIOD = 10
  BROKERS = 'localhost:9092'
  PREDICTION_TOPIC = 'flight_delay_classification_request'
  
  try:
    sc and ssc
  except NameError as e:
    import findspark

    # Add the streaming package and initialize
    findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
    findspark.init()
    
    import pyspark
    import pyspark.sql
    import pyspark.streaming
  
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf)
    ssc = StreamingContext(sc, PERIOD)
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load all models to be used in making predictions
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string field vectorizer pipelines into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model

  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)

  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Process Prediction Requests in Streaming
  #
  stream = KafkaUtils.createDirectStream(
    ssc,
    [PREDICTION_TOPIC],
    {
      "metadata.broker.list": BROKERS,
      "group.id": "0",
    }
  )

  object_stream = stream.map(lambda x: json.loads(x[1]))
  object_stream.pprint()
  
  row_stream = object_stream.map(
    lambda x: Row(
      FlightDate=iso8601.parse_date(x['FlightDate']),
      Origin=x['Origin'],
      Distance=x['Distance'],
      DayOfMonth=x['DayOfMonth'],
      DayOfYear=x['DayOfYear'],
      UUID=x['UUID'],
      DepDelay=x['DepDelay'],
      DayOfWeek=x['DayOfWeek'],
      FlightNum=x['FlightNum'],
      Dest=x['Dest'],
      Timestamp=iso8601.parse_date(x['Timestamp']),
      Carrier=x['Carrier']
    )
  )
  row_stream.pprint()

  #
  # Create a dataframe from the RDD-based object stream
  #

  def classify_prediction_requests(rdd):
  
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
  
    prediction_request_schema = StructType([
      StructField("Carrier", StringType(), True),
      StructField("DayOfMonth", IntegerType(), True),
      StructField("DayOfWeek", IntegerType(), True),
      StructField("DayOfYear", IntegerType(), True),
      StructField("DepDelay", DoubleType(), True),
      StructField("Dest", StringType(), True),
      StructField("Distance", DoubleType(), True),
      StructField("FlightDate", DateType(), True),
      StructField("FlightNum", StringType(), True),
      StructField("Origin", StringType(), True),
      StructField("Timestamp", TimestampType(), True),
      StructField("UUID", StringType(), True),
    ])
    
    prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests_df.withColumn(
      'Route',
      concat(
        prediction_requests_df.Origin,
        lit('-'),
        prediction_requests_df.Dest
      )
    )
    prediction_requests_with_route.show(6)
  
    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
      string_indexer_model = string_indexer_models[column]
      prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
  
    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
    
    # Inspect the vectors
    final_vectorized_features.show()
  
    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
      final_vectorized_features = final_vectorized_features.drop(column)
  
    # Inspect the finalized features
    final_vectorized_features.show()
  
    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)
  
    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
    # Inspect the output
    final_predictions.show()
  
    # Store to Mongo
    if final_predictions.count() > 0:
      final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
      )
  
  # Do the classification and store to Mongo
  row_stream.foreachRDD(classify_prediction_requests)
  
  ssc.start()
  ssc.awaitTermination()
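
For reference, the module-level preamble this streaming script appears to assume (Spark 2.x DStream API; the pymongo_spark activation is an assumption based on the saveToMongoDB call):

# Assumed preamble for make_predictions_streaming.py
import json
import iso8601
import pymongo_spark
pymongo_spark.activate()  # enables rdd.saveToMongoDB
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils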
Example #14
def main(base_path):

    spark = SparkSession.builder.config("spark.default.parallelism",
                                        1).appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Messages look like:
    #

    # {
    #   "Carrier": "DL",
    #   "DayOfMonth": 25,
    #   "DayOfWeek": 4,
    #   "DayOfYear": 359,
    #   "DepDelay": 10.0,
    #   "Dest": "LAX",
    #   "Distance": 2475.0,
    #   "FlightDate": "2015-12-25",
    #   "FlightNum": null,
    #   "Origin": "JFK",
    #   "Timestamp": "2019-10-31T00:19:47.633280",
    #   "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385"
    # }

    #
    # Process Prediction Requests from Kafka
    #
    message_df = spark \
      .readStream \
      .format("kafka") \
      .option("kafka.bootstrap.servers", BROKERS) \
      .option("subscribe", PREDICTION_TOPIC) \
      .load()

    # Define the schema for the incoming prediction request JSON
    schema = T.StructType([
        T.StructField("Carrier", T.StringType()),
        T.StructField("DayOfMonth", T.IntegerType()),
        T.StructField("DayOfWeek", T.IntegerType()),
        T.StructField("DayOfYear", T.IntegerType()),
        T.StructField("DepDelay", T.FloatType()),
        T.StructField("Dest", T.StringType()),
        T.StructField("Distance", T.FloatType()),
        T.StructField("FlightDate", T.StringType()),
        T.StructField("FlightNum", T.StringType()),
        T.StructField("Origin", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
        T.StructField("UUID", T.StringType()),
    ])

    prediction_requests_df = message_df.select(
        F.from_json(F.col("value").cast("string"),
                    schema).alias("data")).select("data.*")

    #
    # Add a Route variable to replace FlightNum
    #
    prediction_requests_with_route = prediction_requests_df.withColumn(
        'Route',
        F.concat(prediction_requests_df.Origin, F.lit('-'),
                 prediction_requests_df.Dest))

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the individual index columns
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Store the results to MongoDB
    class MongoWriter:
        def open(self, partition_id, epoch_id):
            print(f"Opened partition id: {partition_id}, epoch: {epoch_id}")

            self.mongo_client = pymongo.MongoClient()
            print(f"Opened MongoClient: {self.mongo_client}")

            return True

        def process(self, row):
            print(f"Processing row: {row}")

            as_dict = row.asDict()
            print(f"Inserting row.asDict(): {as_dict}")

            id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one(
                as_dict)
            print(f"Inserted row, got ID: {id.inserted_id}")

            self.mongo_client.close()

            return True

        def close(self, error):
            print("Closed with error: %s" % str(error))

            return True

    query = final_predictions.writeStream.foreach(MongoWriter()).start()

    query.awaitTermination()
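
This script references APP_NAME, BROKERS, PREDICTION_TOPIC, F, T and pymongo without defining them; based on the sibling streaming examples above, the assumed preamble is roughly:

# Assumed module-level preamble (values copied from the related examples).
import pymongo
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T

APP_NAME = "make_predictions_streaming.py"
BROKERS = "localhost:9092"
PREDICTION_TOPIC = "flight_delay_classification_request"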
    #          67673, 67688, 67689, 67690, 67691, 67701, 67708, 67709, 67718, 67719, 67728, 67961, 67962, 67977, 67978,
    #          67980, 67981, 67984, 67989, 67990, 67995, 67996, 67997, 67999, 68007, 68009, 68010, 68011, 68017, 68018,
    #          68019, 68251, 68267, 68268, 68269, 68270, 68274, 68279, 68284, 68285, 68286, 68287, 68288, 68301, 68307,
    #          68308, 68556, 68557, 68558, 68559, 68560, 68562, 68563, 68569, 68573, 68574, 68575, 68576, 68591, 68597,
    #          68598, 68846, 68847, 68848, 68849, 68850, 68853, 68858, 68859, 68861, 68866, 68870, 68881, 68885, 68887,
    #          68888, 68889, 69136, 69137, 69138, 69139, 69140, 69141, 69142, 69144, 69148, 69150, 69151, 69154, 69156,
    #          69169, 69170, 69171, 69172, 69177, 69425, 69426, 69427, 69429, 69432, 69433, 69434, 69438, 69442, 69443,
    #          69446, 69455, 69462, 69467, 69715, 69716, 69718, 69722, 69724, 69728, 69731, 69732, 69736, 69742, 69743,
    #          69744, 69745, 69751, 69752, 69753, 69755, 69757, 70005, 70006, 70008, 70012, 70018, 70030, 70031, 70032,
    #          70033, 70034, 70042, 70043, 70044, 70045, 70046, 70048, 70049, 70295, 70296, 70297, 70298, 70299, 70300,
    #          70302, 70303, 70309, 70317, 70318, 70319, 70320, 70324, 70333, 70334, 70336, 70566, 70585, 70586, 70587,
    #          70588, 70590, 70591, 70592, 70593, 70606, 70607, 70610, 70623, 70624, 70625, 70626, 70855, 70856, 70875,
    #          70877, 70878, 70879, 70880, 70881, 70882, 70883, 70887, 70901, 70910, 70915, 70916, 71144, 71145, 71146,
    #          71165, 71166, 71168, 71169, 71170, 71171, 71172, 71202, 71206, 71435, 71436, 71455, 71467, 71485, 71724,
    #          71744, 71745, 71746, 71747, 71757, 71758, 71771, 71772, 71775, 72012, 72013, 72014, 72015, 72036, 72037,
    #          72038, 72039, 72060, 72061, 72062, 72325, 72326, 72327, 72328, 72329, 72334, 72348, 72349, 72350, 72591,
    #          72616, 72617, 72618, 72624, 72625, 72884, 72907, 72908, 72909, 72913, 72916, 72917, 73182, 73194, 73195,
    #          73197, 73203, 73205, 73472, 73485, 73486, 73487, 73489, 73491, 73492, 73494, 73775, 73776, 73781, 73782,
    #          73783, 73784, 74061, 74062, 74065, 74066, 74070, 74071, 74072, 74073, 74091, 74351, 74352, 74353, 74354,
    #          74355, 74356, 74359, 74361, 74362, 74381, 74641, 74642, 74643, 74644, 74645, 74646, 74649, 74650, 74651,
    #          74652, 74922, 74936, 74940, 75226, 75229, 75230, 75520, 75817, 76384, 76385, 76391, 76397, 76402, 76687,
    #          76691, 76692, 76962, 77251, 77252, 77255, 77256, 77540, 77541, 77542, 77828, 77829, 77830, 77831, 77832,
    #          78118, 78119, 78122, 78409, 78410, 78411, 78412, 79862, 80152]
    mlist = []
    # ilist = random.sample(ilist, 10)
    index = 40664
    model = RandomForestClassificationModel.load(
        'hdfs://master:9000//fcd/split/serialModel/model_{}'.format(index))
    mlist.append(model)
    sc.stop()
Example #16
                                        maxDepth=10)
    model = classifier.fit(train_data)

    # Transform the test data using the model to get predictions
    predicted_test_data = model.transform(test_data)

    # Evaluate the model performance
    evaluator_f1 = MulticlassClassificationEvaluator(
        labelCol='gender', predictionCol='prediction', metricName='f1')
    print("F1 score: {}", evaluator_f1.evaluate(predicted_test_data))

    evaluator_accuracy = MulticlassClassificationEvaluator(
        labelCol='gender', predictionCol='prediction', metricName='accuracy')
    print("Accuracy: {}", evaluator_accuracy.evaluate(predicted_test_data))

    # Predict some new records
    # In real case, use VectorAssembler to transform df for features column
    data_to_predict = final_data.select("features").limit(10)
    model.transform(data_to_predict).show()

    # Save the model
    model.save("hdfs://devenv/user/spark/web_logs_analysis/gender_model/")

    # Read the saved model
    model_reloaded = RandomForestClassificationModel.load(
        "hdfs://devenv/user/spark/web_logs_analysis/gender_model/")

    # Predict some new records
    # In real case, use VectorAssembler to transform df for features column
    data_to_predict = final_data.select("features").limit(10)
    model_reloaded.transform(data_to_predict).show()
Example #17
                        featuresCol='X')
LR_model = LR.fit(X_train_large)
LR_model.save(LR_model_path)

# Random Forest
RF = RandomForestClassifier(numTrees=100,
                            maxDepth=15,
                            labelCol="score",
                            featuresCol="X")
RF_model = RF.fit(X_train_large)
RF_model.save(RF_model_path)

# Loading all trained models
NB_Model = NaiveBayesModel.load(NB_model_path)
LR_Model = LogisticRegressionModel.load(LR_model_path)
RF_Model = RandomForestClassificationModel.load(RF_model_path)

voteClassifier = VoteClassifier(NB_Model, LR_Model, RF_Model)
evaluate(voteClassifier.transform_vote(X_test_large),
         confusion=False,
         predictionCol='prediction_vote')
evaluate(voteClassifier.transform_vote(X_test_imbd),
         confusion=False,
         predictionCol='prediction_vote')
voteClassifier.transform_vote(X_test_imbd).show()
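
VoteClassifier is not defined in this snippet. A minimal majority-vote reconstruction for binary labels, assuming all three fitted models were trained on the same "X" features column (a sketch, not the author's class):

from pyspark.sql import functions as F

class VoteClassifier:
    """Hypothetical majority-vote wrapper over fitted binary classifiers."""

    def __init__(self, *models):
        self.models = models

    def transform_vote(self, df):
        out = df
        for i, model in enumerate(self.models):
            out = (model.transform(out)
                   .withColumnRenamed("prediction", "pred_{}".format(i))
                   .drop("rawPrediction", "probability"))
        votes = sum(F.col("pred_{}".format(i)) for i in range(len(self.models)))
        # a strict majority of the 0.0/1.0 predictions wins
        return out.withColumn("prediction_vote",
                              (votes * 2 > len(self.models)).cast("double"))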

# Accuracy: (TP+TN)/N
# Positive Predictive Value: TP/(TP+FP)
# Negative Predictive Value: TN/(TN+FN)
import matplotlib.pyplot as plt
import sys

#Create and connect to spark session, read data given in docker command
spark = SparkSession.builder.master('local[*]').appName(
    'Predict_model').getOrCreate()
test_set = spark.read.csv(sys.argv[-1], header=True, inferSchema=True, sep=';')

# Create feature vector
assembler = VectorAssembler(inputCols=[
    test_set.columns[0], test_set.columns[1], test_set.columns[2],
    test_set.columns[3], test_set.columns[4], test_set.columns[5],
    test_set.columns[6], test_set.columns[7], test_set.columns[8],
    test_set.columns[9], test_set.columns[10]
],
                            outputCol='features')
test_assembled = assembler.transform(test_set)
test_assembled = test_assembled.select(test_assembled.columns[-1],
                                       test_assembled.columns[-2])

# Load trained classification model
rfp = RandomForestClassificationModel.load('RF_model')

#Predict classes of new data
predictions = rfp.transform(test_assembled)

#Evaluate model performance
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol=test_assembled.columns[-1], metricName='f1')
print('F-1 Score of the classification model:',
      multi_evaluator.evaluate(predictions))
Example #19
def load_model():
    rf = RandomForestClassificationModel.load(
        "s3://wineapp-parth/rf_model.model/")
    return rf
spark_session = SparkSession.builder.master("local").appName(
    "wineClasssification").getOrCreate()

print("\nProgram has started : \n")

##--------------------------------------         code to read dataset               ------------------------##
testDataframe = spark_session.read.csv('TestDataset.csv',
                                       header='true',
                                       inferSchema='true',
                                       sep=';')
feature = [c for c in testDataframe.columns if c != 'quality']
assembler_test = VectorAssembler(inputCols=feature, outputCol="features")
test_trans = assembler_test.transform(testDataframe)

##--------------------------------------         code to load model                ------------------------##
model = RandomForestClassificationModel.load("model")

##--------------------------------------         code to predict                ------------------------##
predictions = model.transform(test_trans)

##--------------------------------------         code to print accuracy                ------------------------##
accuracy = MulticlassClassificationEvaluator(
    labelCol="quality", predictionCol="prediction",
    metricName="accuracy").evaluate(predictions)
print("Testing- Accuracy Error = %g" % (1.0 - accuracy))

transformed_data = model.transform(test_trans)
print(
    MulticlassClassificationEvaluator(labelCol="quality",
                                      predictionCol="prediction",
                                      metricName="accuracy").getMetricName(),
Example #21
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml import PipelineModel
import time
#Load in onehotencoder and rf model
spark = SparkSession.builder.getOrCreate()
print("In load_model:")
loadtime = time.time()
rf_pipeline = PipelineModel.load("/home/ubuntu/pipeline")
rf_model = RandomForestClassificationModel.load("/home/ubuntu/model_rf")
print("after load_model: %s seconds" % (time.time() - loadtime))


def change_type(df):
    df = df.withColumn('credit_score', col('credit_score').cast(IntegerType()))
    df = df.withColumn('original_dti', col('original_dti').cast(IntegerType()))
    df = df.withColumn('original_upb', col('original_upb').cast(IntegerType()))
    df = df.withColumn('original_ltv', col('original_ltv').cast(IntegerType()))
    df = df.withColumn('original_interest_rate',
                       col('original_interest_rate').cast(DoubleType()))
    df = df.withColumn('number_of_units',
                       col('number_of_units').cast(IntegerType()))
    df = df.withColumn('mip', col('mip').cast(IntegerType()))
    return df
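
With the pipeline, model and change_type helper loaded above, scoring a new batch of records would plausibly chain them as follows (the column handling is an assumption):

# Hedged scoring sketch: cast types, run the feature pipeline, then predict.
def score_loans(raw_df):
    typed = change_type(raw_df)
    featurized = rf_pipeline.transform(typed)   # assumes the pipeline outputs 'features'
    return rf_model.transform(featurized).select("prediction")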
Example #22
    
sensorImportancesPD = pd.DataFrame.from_records(list(sensorImportances.items()), columns=['Sensor','Importance (%)'])\
  .sort_values('Importance (%)')
    
sb.set_color_codes("pastel")
sb.barplot(x="Importance (%)", y="Sensor", 
           data=sensorImportancesPD,
           label="Total", color="b")

# #### Model Saving/Loading
# We can save models and pipelines for re-use later 
model.bestModel.write().overwrite().save(path='rf_sensor_maintenance.mdl')
!rm -rf rf_sensor_maintenance.mdl
!hdfs dfs -get models/rf_sensor_maintenance.mdl

newModel = RandomForestClassificationModel.load('rf_sensor_maintenance.mdl')
predictions = newModel.transform(li.transform(va))
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Let's see how much maintenance we could have saved if we used this model
def f(actual, predicted, cost):
    if actual==predicted:
        if actual=='Corrective':
          return 0
        elif actual=='Preventive':
          return cost
        elif actual=='Healthy':
          return 30000
    else:
        return cost
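
One way to apply this cost function across the predictions DataFrame, assuming hypothetical columns "maintenance" (actual label), "predicted_maintenance" and "cost":

# Hedged sketch: wrap f as a UDF and total the simulated maintenance spend.
from pyspark.sql.functions import udf, col
from pyspark.sql.types import DoubleType

cost_udf = udf(lambda a, p, c: float(f(a, p, c)), DoubleType())
with_cost = predictions.withColumn(
    "simulated_cost",
    cost_udf(col("maintenance"), col("predicted_maintenance"), col("cost")))
with_cost.groupBy().sum("simulated_cost").show()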
Example #23
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import RandomForestClassificationModel

spark = SparkSession.builder.master("local").appName(
    "wineClasssification").getOrCreate()

testDf = spark.read.format('csv').options(
    header='true', inferSchema='true',
    delimiter=';').csv("s3://cs643/TrainingDataset.csv")
feature = [c for c in testDf.columns if c != 'quality']
assembler_test = VectorAssembler(inputCols=feature, outputCol="features")
test_trans = assembler_test.transform(testDf)

model = RandomForestClassificationModel.load(
    "s3://cs643/wine_train_model.model")

predictions = model.transform(test_trans)

eval = MulticlassClassificationEvaluator(labelCol='""""quality"""""',
                                         predictionCol="prediction",
                                         metricName="accuracy")
accuracy = eval.evaluate(predictions)
print("accuracy test Error = %g" % (1.0 - accuracy))

from pyspark.mllib.evaluation import MulticlassMetrics
transformed_data = model.transform(test_trans)
print(eval.getMetricName(), 'accuracy:', eval.evaluate(transformed_data))

eval1 = MulticlassClassificationEvaluator(labelCol='""""quality"""""',
                                          predictionCol="prediction",
sc.setLogLevel("ERROR")

app = Flask(__name__)

schema = StructType([
    StructField("sepal_length", FloatType()),
    StructField("sepal_width", FloatType()),
    StructField("petal_length", FloatType()),
    StructField("petal_width", FloatType()),
    StructField("class", StringType())
])

predict_schema = StructType(schema.fields[:-1])

pipelineModel = PipelineModel.load("api/sparksaves/pipelineModel")
rfModel = RandomForestClassificationModel.load("api/sparksaves/rfModel")

spark = SparkSession.builder.getOrCreate()


@app.route('/get_prediction', methods=['POST'])
def calc_prob():
    """Calculate probability for species."""
    input_features = [[
        float(request.json["sepal_length"]),
        float(request.json["sepal_width"]),
        float(request.json["petal_length"]),
        float(request.json["petal_width"])
    ]]

    predict_df = spark.createDataFrame(data=input_features,
print(rf_accuracy)

rf_precision=MulticlassClassificationEvaluator(labelCol='affairs',metricName='weightedPrecision').evaluate(rf_predictions)
print('The precision rate on test data is {0:.0%}'.format(rf_precision))

rf_precision

rf_auc=BinaryClassificationEvaluator(labelCol='affairs').evaluate(rf_predictions)
print(rf_auc)

# Feature importance
rf_classifier.featureImportances
df.schema["features"].metadata["ml_attr"]["attrs"]

# Save the model 
rf_classifier.save("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")

from pyspark.ml.classification import RandomForestClassificationModel

rf=RandomForestClassificationModel.load("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")
test_df.show(5)
model_preditions=rf.transform(test_df)
model_preditions.show()

single_df = spark.createDataFrame([[5.0,33.0,5.0,1.0,5.0,0.0]], ['rate_marriage', 'age', 'yrs_married', 'children', 'religious', 'affairs'])
single_df = df_assembler.transform(single_df)
single_df = single_df.select(['features','affairs'])

model_predition=rf.transform(single_df)
model_predition.show()
Example #26
    #remove punctuation
    pp_udf = udf(preprocess, ArrayType(StringType()))
    words = ads_free.withColumn('Words', pp_udf(ads_free.Text))

    #remove stop words
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    removed = remover.transform(words)

    params_path = '../tmp/{}'

    #Load trained hashing frequency and transform
    hf_path = params_path.format('hf')
    hashingTF = HashingTF.load(hf_path)
    featureized = hashingTF.transform(removed)

    #Load trained hashing frequency and transform
    idf_path = params_path.format('idfmodel')
    idfmodel = IDFModel.load(idf_path)
    result = idfmodel.transform(featureized)

    #load rf model and predict
    rf_path = params_path.format('rf')
    rf = RandomForestClassificationModel.load(rf_path)
    prediction = rf.transform(result)

    path_to_save = '../tmp/twitterstream_test_prediction.json'
    prediction.write.json(path_to_save)

    #test whether json is written
    test = spark.read.json(path_to_save)
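
The fragment above relies on transformers loaded from disk; the imports it appears to assume are:

# Assumed imports for the snippet above.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDFModel
from pyspark.ml.classification import RandomForestClassificationModel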
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer
from pyspark.ml.classification import RandomForestClassificationModel

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .getOrCreate()

    # Load full data
    logs = spark.read.parquet("hdfs://devenv/user/spark/spark_mllib_101/ec_web_logs_analysis/data/")

    # Age group prediction
    # Load age group model
    age_group_model = RandomForestClassificationModel.load(
        "hdfs://devenv/user/spark/spark_mllib_101/ec_web_logs_analysis/model_age_group_prediction/")
    # +---------+-----------------+
    # |age_group|age_group_indexed|
    # +---------+-----------------+
    # | under 20|              2.0|
    # |  over 50|              3.0|
    # |    21-35|              0.0|
    # |    36-50|              1.0|
    # +---------+-----------------+

    # Prepare features and preprocessing
    data_prep = logs.select("device_id", "product_category_id", "device_type", "connect_type", "age_group")

    data_prep = VectorAssembler(inputCols=["product_category_id", "device_type", "connect_type"],
                                outputCol="features").transform(data_prep)
        #reading the saved countvector model
        cv = CountVectorizerModel.load(args.model_path + '/countvector_model')
        #transforming test data to count vector
        testing_data = cv.transform(testing_data)
        #saving the transformed data as parquet file
        testing_data.write.parquet(args.model_path + '/testingdata.parquet')

        print(
            '********************* after cv transformation *****************')
        print(
            '********************* after cv transformation *****************')
        print(
            '********************* after cv transformation  *****************')

        #reading the saved random forest model
        rfModel = RandomForestClassificationModel.load(args.model_path +
                                                       '/rfmodel')
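        # NOTE: `predict` is a helper defined elsewhere in the full script; it is
        # assumed here to simply wrap rfModel.transform(testing_data).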
        #getting the predictions
        predictions = predict(rfModel, testing_data)

        #saving the predictions as parquet file
        predictions.write.parquet(args.model_path + '/predictions.parquet')

        print('********************* after predictions  *****************')
        print('********************* after predictions  *****************')
        print('********************* Done  *****************')

    else:
        print("Enter correct mode (train or test)")
def main(base_path):

    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        # The names below are used later in this function; in the full script they
        # presumably come from module-level imports, but they are listed here so
        # the fragment stands on its own.
        import json
        import iso8601
        from pyspark import SparkConf, SparkContext
        from pyspark.sql import Row
        from pyspark.streaming import StreamingContext
        from pyspark.streaming.kafka import KafkaUtils

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })
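    # Each Kafka record is a (key, value) tuple; the value (x[1]) carries the
    # JSON-encoded prediction request, which the next map parses into a dict.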

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #

    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in [
                "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
                "Dest", "Route"
        ]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
            "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )
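        # NOTE: saveToMongoDB is not a built-in RDD method; it is added by the
        # pymongo_spark package, which this fragment assumes was imported and
        # activated (pymongo_spark.activate()) at module level.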

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
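# Minimal entry point, assumed for completeness (the original listing stops at
# awaitTermination); the script is expected to be invoked with the base path:
if __name__ == "__main__":
    import sys
    main(sys.argv[1])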
Ejemplo n.º 30
0
    #load model
    if algoName == "LogisticRegression":
        from pyspark.ml.classification import LogisticRegressionModel
        model = LogisticRegressionModel.load(modelPath)
    elif algoName == "LinearRegression":
        from pyspark.ml.regression import LinearRegressionModel
        model = LinearRegressionModel.load(modelPath)
    elif algoName == "DecisionTreeClassification":
        from pyspark.ml.classification import DecisionTreeClassificationModel
        model = DecisionTreeClassificationModel.load(modelPath)
    elif algoName == "DecisionTreeRegression":
        from pyspark.ml.regression import DecisionTreeRegressionModel
        model = DecisionTreeRegressionModel.load(modelPath)
    elif algoName == "RandomForestClassification":
        from pyspark.ml.classification import RandomForestClassificationModel
        model = RandomForestClassificationModel.load(modelPath)
    elif algoName == "RandomForestRegression":
        from pyspark.ml.regression import RandomForestRegressionModel
        model = RandomForestRegressionModel.load(modelPath)
    elif algoName == "GBTClassification":
        from pyspark.ml.classification import GBTClassificationModel
        model = GBTClassificationModel.load(modelPath)
    elif algoName == "GBTRegression":
        from pyspark.ml.regression import GBTRegressionModel
        model = GBTRegressionModel.load(modelPath)
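    else:
        # Guard added for clarity (not in the original fragment): fail fast on an
        # unrecognized algorithm name instead of hitting a NameError at transform time.
        raise ValueError("Unsupported algorithm: %s" % algoName)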

    #predict
    prediction = model.transform(data).select("prediction")

    #save
    prediction.write.format("csv").save(outputPath)
Ejemplo n.º 31
0
def get_model(model_version, spid, model_date):
    model_version_location = _get_model_version_folder(model_version)
    model_path = os.path.join(model_version_location, 'ctr_model_spid%d_%s' %
                              (spid, model_date)).replace('\\', '/')
    return RandomForestClassificationModel.load(model_path)
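# Example usage (hypothetical values; _get_model_version_folder is defined
# elsewhere in this module):
#   model = get_model("v2", 1017, "20200301")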