def load_Random_Model(dataset):
    print ("Accuracy of best RFC Model with CrossValidation:")
    evaluator = BinaryClassificationEvaluator()
    best_RFModel = RandomForestClassificationModel.load("model/RFM1/")
    predictions = best_RFModel.transform(dataset)
    accuracy = evaluator.evaluate(predictions)
    print "The  accuracy = %g" % accuracy
Example 2
def RandomForest(data):
    path = 'modelo_RandomForest/modelRandomForest'
    randomModel = RandomForestClassificationModel.load(path)
    predictions = randomModel.transform(data)
    print("RANDOM FOREST")
    predictions.select('Email', 'Identificador', 'Burnout_Antes', 'prediction',
                       'probability').show(truncate=False)
Example 3
def predict(test_path, model_name, output_path):
    if model_name is None:
        model_name = 'model'
    if output_path is None:
        output_path = os.path.join(dirname(os.getcwd()), 'predict.csv')

    model_path = os.path.join(dirname(os.getcwd()), 'models', model_name)

    spark = SparkSession \
        .builder \
        .master('local') \
        .appName('Logistic App') \
        .getOrCreate()

    # todo Delete the next line
    spark.sparkContext.setLogLevel('OFF')

    model = RandomForestClassificationModel.load(path=model_path)
    raw_data = spark.read.csv(test_path, header=True)

    dataset = mature_data(raw_data)

    prediction_df = model.transform(dataset).select(
        col('id'),
        col('prediction').cast('int'))
    prediction_df = prediction_df.toPandas()
    prediction_df.to_csv(output_path, index=False)
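This example depends on a few names that are not shown: os and dirname for path handling, col from pyspark.sql.functions, and a mature_data helper that prepares the raw CSV for the model. A minimal sketch under those assumptions (the mature_data body here is hypothetical):

import os
from os.path import dirname

from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col


def mature_data(raw_data):
    # Hypothetical feature preparation: cast the raw string columns to double
    # and assemble them into the 'features' vector the model was trained on.
    feature_cols = [c for c in raw_data.columns if c != 'id']
    casted = raw_data.select(
        col('id'), *[col(c).cast('double').alias(c) for c in feature_cols])
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    return assembler.transform(casted)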
Example 4
    def read_model(self):

        if "LogisticRegression" in self.best_model_path:
            classifier = LogisticRegressionModel.load(self.best_model_path)

        elif "DecisionTree" in self.best_model_path:
            classifier = DecisionTreeClassificationModel.load(
                self.best_model_path)

        elif "RandomForest" in self.best_model_path:
            classifier = RandomForestClassificationModel.load(
                self.best_model_path)

        elif "LinearSVC" in self.best_model_path:
            classifier = LinearSVCModel.load(self.best_model_path)

        if "VGG16" in self.best_model_path:
            featurizer_name = "VGG16"

        elif "VGG19" in self.best_model_path:
            featurizer_name = "VGG19"

        elif "InceptionV3" in self.best_model_path:
            featurizer_name = "InceptionV3"

        elif "Xception" in self.best_model_path:
            featurizer_name = "Xception"

        elif "ResNet50" in self.best_model_path:
            featurizer_name = "ResNet50"

        return featurizer_name, classifier
Example 5
    def _load_models(self):
        hf_path = self.params_path.format('hf')
        idf_path = self.params_path.format('idfmodel')
        rf_path = self.params_path.format('rf')

        self.hashingTF = HashingTF.load(hf_path)
        self.idfmodel = IDFModel.load(idf_path)
        self.rf = RandomForestClassificationModel.load(rf_path)
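Once loaded, the three stages are applied in the same order they were fit. A sketch of a companion prediction method (the method name and the assumption that the input DataFrame already matches hashingTF's input column are ours; compare Example 21 below):

    def _predict(self, removed):
        # Hypothetical companion method: term frequencies -> IDF weights -> random forest
        featurized = self.hashingTF.transform(removed)
        weighted = self.idfmodel.transform(featurized)
        return self.rf.transform(weighted)  # adds rawPrediction, probability, prediction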
def scoring_post_model(pargs, params):
    """
    Function to score the input data using the saved model.
    """

    # Load parameters
    label_class_type = configs['binary_or_multiclass']
    saved_model_path = data_paths[configs['saved_model_path']].format(
        run_mode=run['run_mode'], run_id=run['run_id'])
    scoring_filter_column = configs['scoring_filter_column']
    scoring_filter_date = datetime.datetime.strptime(
        str(configs['scoring_filter_date']), '%Y%m%d')
    feature_list_path = data_paths[configs['feature_list_path']]
    output_scored_data = data_paths[
        configs['scored_data_path']]  # scored data output

    if run['use_sample']:
        abo_dna_data = sqlContext.read.parquet(
            data_paths['abo_dna_sample'].format(run_mode=run['run_mode'],
                                                run_id=run['run_id']))
    else:
        abo_dna_data = sqlContext.read.parquet(
            data_paths['abo_dna_full_file'].format(run_mode=run['run_mode'],
                                                   run_id=run['run_id']))

    trained_model = None

    if label_class_type == "binary":
        trained_model = GBTClassificationModel.load(saved_model_path)

    else:
        trained_model = RandomForestClassificationModel.load(saved_model_path)

    # Select which subset of abo dna data we want to use for scoring
    abo_dna_data_scoring = abo_dna_data.filter(
        F.col(scoring_filter_column) >= scoring_filter_date)

    onehot_pipeline = PipelineModel.load(
        data_paths['migration_onehot_model'].format(run_mode=run['run_mode'],
                                                    run_id=run['run_id']))

    scoring_data, onehot_pipeline, final_feature_list = preprocess_migration_model_data(
        abo_dna_data_scoring, False, onehot_pipeline, None, None)

    # validate that the same input columns are here as training (except the label based columns)
    if final_feature_list != list(
            sqlContext.read.parquet(
                feature_list_path.format(run_mode=run['run_mode'],
                                         run_id=run['run_id'])).columns):
        raise ValueError("Mismatch in training input and test input.")

    # Produce scoring:
    scored_data = trained_model.transform(scoring_data)

    scored_data.write.parquet(output_scored_data.format(
        run_mode=run['run_mode'], run_id=run['run_id']),
                              mode='overwrite')
Example 7
def init():
    print("Begin function...", flush=True)

    global SPARK
    SPARK = SparkSession.builder.appName("DriftTest").getOrCreate()
    print("Spark variable:", SPARK, flush=True)

    global MODEL
    MODEL = RandomForestClassificationModel.load(
        "/hadoop/demo/titanic-spark/titanic")
    def pred_rf_model_spark(cls, model_dir, feature_col, df_new):
        print('model loading start')
        # model = GBTClassificationModel.load(model_dir)
        model = RandomForestClassificationModel.load(str(model_dir))
        # model = LinearSVCModel.load(model_dir)
        # Assemble the listed feature columns into a single 'features' vector
        assembler = VectorAssembler(inputCols=feature_col,
                                    outputCol="features")
        newData = assembler.transform(df_new)
        predictions = model.transform(newData)
        return predictions
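A hypothetical call, assuming pred_rf_model_spark is exposed as a classmethod on a wrapper class and that df_new already contains the listed feature columns (all names below are placeholders):

# Placeholder class, path and column names, for illustration only
feature_cols = ['sensor_a', 'sensor_b', 'sensor_c']
preds = ModelWrapper.pred_rf_model_spark('path/to/rf_model', feature_cols, df_new)
preds.select('prediction', 'probability').show(5, truncate=False)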
Example 9
def RandomForest(data):
    path = 'modelo_RandomForest/modelRandomForest'
    randomModel = RandomForestClassificationModel.load(path)
    predictions = randomModel.transform(data)
    prediccion = predictions.select(
        'prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    print(prediccion[0])
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'

    return prediccionLabel, prediccion[1][0] * 100
def main2(spark, output):
    # STEP 3: Use Classifier to predict Latent Factor Vector and updated ALS Model

    model = RandomForestClassificationModel.load(
        'hdfs:/user/yh2857/short_rf.model')
    lfsdf = spark.read.parquet(
        'hdfs:/user/yh2857/model_frac_1/rank10_reg0.1_alpha0.01/itemFactors')
    idx = lfsdf.rdd.map(lambda row: row[0])
    features = lfsdf.rdd.map(lambda row: row[1])
    lfsdf = idx.zip(features.map(lambda x: Vectors.dense(x))).toDF(
        schema=['id', 'features'])

    # print(lfsdf.count(),lfsdf.select('id').distinct().count(),lfsdf)

    with open('kmean_centers.txt', 'rb') as f:
        centers = pickle.load(f)
    new_centers = []
    for i, c in enumerate(centers):
        new_centers.append([i, Vectors.dense(c.tolist())])
    centerdf = spark.createDataFrame(
        pd.DataFrame(data=new_centers)).withColumnRenamed(
            '0', 'center_idx').withColumnRenamed('1', 'center_features')
    # print(centerdf.count(), centerdf)

    df_path = 'hdfs:/user/yh2857/coldstart_processed_short.parquet'
    new_df = spark.read.parquet(df_path).withColumnRenamed(
        'prediction', 'label')

    train, test = new_df.randomSplit([0.8, 0.2], 24)
    print(test.count(), test.select('item_index').distinct().count(), test)

    predictions = model.transform(test)
    # predictions.select("prediction").distinct().show()
    predicted = predictions.join(centerdf,
                                 predictions.prediction == centerdf.center_idx,
                                 'left')
    # predicted.show()

    original_lfs = lfsdf.join(predicted, lfsdf.id == predicted.item_index,
                              "leftanti")
    predicted = predicted.select('item_index',
                                 'center_features').withColumnRenamed(
                                     "center_features", 'features')

    print(original_lfs)
    print(predicted)
    updated_lfs = original_lfs.withColumnRenamed('id',
                                                 'item_index').union(predicted)
    # updated_lfs.show()
    output_file = 'hdfs:/user/yh2857/rank10_reg0.1_alpha0.1/itemFactors'
    updated_lfs.write.mode('overwrite').parquet(output_file)
    def fit(self, train):
        from pyspark.ml.feature import MinMaxScaler as minmax
        cols = [x for x in train.columns if x not in ['datetime','label']]

        train = train.fillna(0)
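        # Note: the label is assigned at random below, so this fit() only exercises the pipeline mechanics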
        train = train.withColumn('label', when(rand() > 0.5, 1).otherwise(0))

        train.show(n=5)

        assembler = VectorAssembler().setInputCols \
            (cols).setOutputCol("features")

        print('assembler')
        train = assembler.transform(train)
        train = train.fillna(0)
        train = train.drop(*cols)

        rf = RandomForestClassifier(labelCol="label", featuresCol="features", predictionCol='predictions', numTrees=10)

        print('assembler')
        # print(train.show(n=5))
        # train = assembler.transform(train)



        # Chain indexers and forest in a Pipeline
        train.show(n=5)

        # pipeline = Pipeline(stages=[rf])

        print('Train model.  This also runs the indexers.')
        model = rf.fit(train)

        # Save and load model
        model.write().overwrite().save('myRandomForestClassificationModel')
        sameModel = RandomForestClassificationModel.load('myRandomForestClassificationModel')

        print("make predictions")
        # Make predictions.
        predictions = model.transform(train)

        # Select example rows to display.
        predictions.select("predictions", "label", "features").show(5)

        # Select (prediction, true label) and compute test error
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="predictions", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
Example 12
def main():
    spark = SparkSession \
       .builder \
       .appName("RandomForest") \
       .config("spark.executor.heartbeatInterval","60s")\
       .getOrCreate()

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    sc.setLogLevel("INFO")

    # Loading the test data
    df_test = spark.read.parquet(sys.argv[1])

    df_test, df_discard = df_test.randomSplit([0.2, 0.8])

    # Load the model
    rf_model = RandomForestClassificationModel.load(sys.argv[2])

    # Make the predictions
    predictions = rf_model.transform(df_test)

    #predictionsRDD=predictions.rdd

    #predictionsRDD.saveAsTextFile(sys.argv[3]+"output.text")

    evaluator_acc = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol="label", metricName="accuracy")
    accuracy = evaluator_acc.evaluate(predictions)

    print "accuracy *******************"
    print accuracy

    evaluator_pre = MulticlassClassificationEvaluator(
        predictionCol="prediction",
        labelCol="label",
        metricName="weightedPrecision")

    print "precision *******************"
    print evaluator_pre.evaluate(predictions)

    print "recall **********************"
    print MulticlassClassificationEvaluator(
        predictionCol="prediction",
        labelCol="label",
        metricName="weightedRecall").evaluate(predictions)
Example 13
def check_input(data) -> int:
    spark = SparkSession.builder.appName(
        'abc').enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
    rdd = sc.parallelize([data])
    df = spark.read.json(rdd)
    df_assembler = VectorAssembler(
        inputCols=['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
        outputCol="features")
    df = df_assembler.transform(df)
    model_df = df.select('features')
    rf = RandomForestClassificationModel.load("/home/admin/Downloads/RF_model")
    model_predictions = rf.transform(model_df)
    model_predictions = model_predictions.toPandas()['prediction'].values.tolist()
    return model_predictions[0]
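A hypothetical invocation, assuming data is a JSON string carrying the numeric fields B through I that the assembler expects (the values below are made up):

sample = '{"B": 1.0, "C": 0.0, "D": 2.5, "E": 1.0, "F": 0.0, "G": 3.0, "H": 1.0, "I": 0.0}'
predicted_class = check_input(sample)
print(predicted_class)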
Example 14
def random_forest_classifier():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    rf = RandomForestClassifier(numTrees=3,
                                maxDepth=2,
                                labelCol="indexed",
                                seed=42)
    model = rf.fit(td)
    # model.featureImportances
    # # SparseVector(1, {0: 1.0})
    # allclose(model.treeWeights, [1.0, 1.0, 1.0])
    # # True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    result = model.transform(test0).head()
    # result.prediction
    # # 0.0
    # numpy.argmax(result.probability)
    # # 0
    # numpy.argmax(result.rawPrediction)
    # # 0
    # test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    # model.transform(test1).head().prediction
    # # 1.0
    # model.trees
    # # [DecisionTreeClassificationModel (uid=...) of depth..., DecisionTreeClassificationModel...]
    temp_path = "."
    rfc_path = temp_path + "/rfc"
    rf.write().overwrite().save(rfc_path)
    rf2 = RandomForestClassifier.load(rfc_path)
    # rf2.getNumTrees()
    # # 3
    model_path = temp_path + "/rfc_model"
    model.write().overwrite().save(model_path)
    model2 = RandomForestClassificationModel.load(model_path)
def init(path="./"):
    global indexModel, ohPipelineModel, scaler, mlModel, info, spark
    # start spark session
    spark = pyspark.sql.SparkSession.builder.appName('scoring').getOrCreate()
    # load the models
    stringIndexModelFile = path + 'stringIndexModel'
    oneHotEncoderModelFile = path + 'oneHotEncoderModel'
    featureScaleModelFile = path + 'featureScaleModel'
    scaler = StandardScalerModel.load(featureScaleModelFile)
    ohPipelineModel = PipelineModel.load(oneHotEncoderModelFile)
    indexModel = PipelineModel.load(stringIndexModelFile)

    mlModelFile = path + 'mlModel'
    mlModel = RandomForestClassificationModel.load(mlModelFile)

    infoFile = path + 'info'
    info = None
    # load info
    with open(infoFile, 'rb') as handle:
        info = pickle.load(handle)
    def predict(self, test):

        cols = [x for x in test.columns if x not in ['datetime', 'label']]
        test = test.fillna(0)
        test.printSchema()
        print('Test Columns : ' + str(len(test.columns)))
        print('Test Rows : ' + str(test.count()))

        assembler = VectorAssembler().setInputCols \
            (cols).setOutputCol("features")

        print('assembler')
        test = assembler.transform(test)
        test = test.fillna(0)
        test = test.drop(*cols)


        rf = RandomForestClassificationModel.load('myRandomForestClassificationModel')
        preds = rf.transform(test)
        preds.printSchema()
        return preds
Example 17
def main():
    # spark = SparkSession.builder.appName('google-play-store-streamer').getOrCreate()
    sc = SparkContext(appName="PysparkStreaming").getOrCreate()
    ssc = StreamingContext(sc, 3)

    # Load Model
    model = RandomForestClassificationModel.load(MODEL_PATH)

    def parseStream(rdd):
        if not rdd.isEmpty():
            spark = SparkSession.builder.getOrCreate()
            df = spark.read.json(rdd)
            df.show()
            # Vectorize data
            feature_cols = df.columns
            feature_cols.remove('Installs indexed')
            assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol="features",
                                        handleInvalid="error")
            pipeline = Pipeline(stages=[assembler])
            outputModel = pipeline.fit(df)
            output = outputModel.transform(df)
            final_data = output.select("features", "Installs indexed")
            # Predict
            predictions = model.transform(final_data)
            evaluator = MulticlassClassificationEvaluator(
                labelCol="Installs indexed",
                predictionCol="prediction",
                metricName="accuracy")
            accuracy = evaluator.evaluate(predictions)
            print("Random forest test Error = %g" % (1.0 - accuracy))
            randomForestError = (1.0 - accuracy)
            print(randomForestError)

    stream_data = ssc.textFileStream('StreamData/')
    stream_data.foreachRDD(lambda rdd: parseStream(rdd))

    ssc.start()
    ssc.awaitTermination()
Example 18
def sendRecord(df):

    from pyspark.ml.classification import RandomForestClassificationModel
    sameModel = RandomForestClassificationModel.load("randomForest.model")

    df = df.withColumn("amount", df["amount"].cast(FloatType()))
    df = df.withColumn("newbalanceDest",
                       df["newbalanceDest"].cast(FloatType()))
    df = df.withColumn("newbalanceOrig",
                       df["newbalanceOrig"].cast(FloatType()))
    df = df.withColumn("oldbalanceDest",
                       df["oldbalanceDest"].cast(FloatType()))
    df = df.withColumn("oldbalanceOrg", df["oldbalanceOrg"].cast(FloatType()))
    df = df.withColumn("isFlaggedFraud",
                       df["isFlaggedFraud"].cast(IntegerType()))
    df = df.withColumn("step", df["step"].cast(IntegerType()))
    df = df.withColumn("Type", df["Type"].cast(IntegerType()))

    assembler = VectorAssembler(inputCols=[
        "Type", "amount", "newbalanceDest", "newbalanceOrig", "oldbalanceDest",
        "oldbalanceOrg", "step"
    ],
                                outputCol="features")

    output = assembler.transform(df).select("features")

    predictions = sameModel.transform(output)

    pr = predictions.select("prediction")

    pr = pr.rdd

    if pr.collect() == [Row(prediction=1.0)]:
        print("FRAUD!!!!")
    else:
        print("Not Fraud")
        #reading the saved countvector model
        cv = CountVectorizerModel.load(args.model_path + '/countvector_model')
        #transforming test data to count vector
        testing_data = cv.transform(testing_data)
        #saving the transformed data as parquet file
        testing_data.write.parquet(args.model_path + '/testingdata.parquet')

        print(
            '********************* after cv transformation *****************')
        print(
            '********************* after cv transformation *****************')
        print(
            '********************* after cv transformation  *****************')

        #reading the saved random forest model
        rfModel = RandomForestClassificationModel.load(args.model_path +
                                                       '/rfmodel')
        #getting the predictions
        predictions = predict(rfModel, testing_data)

        #saving the predictions as parquet file
        predictions.write.parquet(args.model_path + '/predictions.parquet')

        print('********************* after predictions  *****************')
        print('********************* after predictions  *****************')
        print('********************* Done  *****************')

    else:
        print("Enter correct mode (train or test)")
Example 20
predictions = rForestModel.transform(pTestDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="f1")
evaluator.evaluate(predictions)

# %%

lr = LogisticRegression(featuresCol='features', labelCol='class')
lrModel = lr.fit(pTrainDF)
predictionsLR = lrModel.transform(pTestDF)
evaluator.evaluate(predictionsLR)

# %%
naiveBayes = NaiveBayes(featuresCol='features', labelCol='class')
naiveModel = naiveBayes.fit(pTrainDF)
predictionsNaive = naiveModel.transform(pTestDF)
evaluator.evaluate(predictionsNaive)


# %%
pipelineModel.save('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/pipelineW2V')
rForestModel.save('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/rForest')

#%%
pipelineModel = PipelineModel.load('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/pipelineW2V')
rForestModel = RandomForestClassificationModel.load('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/rForest')


# %%
Example 21
    #remove punctuation
    pp_udf = udf(preprocess, ArrayType(StringType()))
    words = ads_free.withColumn('Words', pp_udf(ads_free.Text))

    #remove stop words
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    removed = remover.transform(words)

    params_path = '../tmp/{}'

    #Load trained hashing frequency model and transform
    hf_path = params_path.format('hf')
    hashingTF = HashingTF.load(hf_path)
    featurized = hashingTF.transform(removed)

    #Load trained IDF model and transform
    idf_path = params_path.format('idfmodel')
    idfmodel = IDFModel.load(idf_path)
    result = idfmodel.transform(featurized)

    #load rf model and predict
    rf_path = params_path.format('rf')
    rf = RandomForestClassificationModel.load(rf_path)
    prediction = rf.transform(result)

    path_to_save = '../tmp/twitterstream_test_prediction.json'
    prediction.write.json(path_to_save)

    #test whether json is written
    test = spark.read.json(path_to_save)
def main(iso_date, base_path):

  APP_NAME = "make_predictions.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load each and every model in the pipeline
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string indexers into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model
  
  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)
    
  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
      base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Run the requests through the transformations from training
  #
  
  # Get today and tomorrow's dates as iso strings to scope query
  today_dt = iso8601.parse_date(iso_date)
  rounded_today = today_dt.date()
  iso_today = rounded_today.isoformat()

  # Build the day's input path: a date based primary key directory structure
  today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
    base_path,
    iso_today
  )

  from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField

  schema = StructType([
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Timestamp", TimestampType(), True),
  ])
  
  prediction_requests = spark.read.json(today_input_path, schema=schema)
  prediction_requests.show()

  #
  # Add a Route variable to replace FlightNum
  #
  
  from pyspark.sql.functions import lit, concat
  prediction_requests_with_route = prediction_requests.withColumn(
    'Route',
    concat(
      prediction_requests.Origin,
      lit('-'),
      prediction_requests.Dest
    )
  )
  prediction_requests_with_route.show(6)
  
  # Index string fields with the corresponding indexer for that column
  for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                 "Origin", "Dest", "Route"]:
    string_indexer_model = string_indexer_models[column]
    prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
      
  # Vectorize numeric columns: DepDelay and Distance
  final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
  
  # Drop the indexes for the nominal fields
  index_columns = ["Carrier_index", "DayOfMonth_index","DayOfWeek_index",
                   "DayOfYear_index", "Origin_index", "Origin_index",
                   "Dest_index", "Route_index"]
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)

  # Inspect the finalized features
  final_vectorized_features.show()
  
  # Make the prediction
  predictions = rfc.transform(final_vectorized_features)
  
  # Drop the features vector and prediction metadata to give the original fields
  predictions = predictions.drop("Features_vec")
  final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
  # Inspect the output
  final_predictions.show()
  
  # Build the day's output path: a date based primary key directory structure
  today_output_path = "{}/data/prediction_results_daily.json/{}".format(
    base_path,
    iso_today
  )
  
  # Save the output to its daily bucket
  final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)
sc.setLogLevel("ERROR")

app = Flask(__name__)

schema = StructType([
    StructField("sepal_length", FloatType()),
    StructField("sepal_width", FloatType()),
    StructField("petal_length", FloatType()),
    StructField("petal_width", FloatType()),
    StructField("class", StringType())
])

predict_schema = StructType(schema.fields[:-1])

pipelineModel = PipelineModel.load("api/sparksaves/pipelineModel")
rfModel = RandomForestClassificationModel.load("api/sparksaves/rfModel")

spark = SparkSession.builder.getOrCreate()


@app.route('/get_prediction', methods=['POST'])
def calc_prob():
    """Calculate probability for species."""
    input_features = [[
        float(request.json["sepal_length"]),
        float(request.json["sepal_width"]),
        float(request.json["petal_length"]),
        float(request.json["petal_width"])
    ]]

    predict_df = spark.createDataFrame(data=input_features,
Esempio n. 24
0
    def label_failure_modes(cls, site, did, rd_item, df, model_dir, sc):
        '''
        :param site: site, e.g. 'fab15', 'fab10'
        :param did: design id, e.g. 'Z32D'
        :param rd_item: rd bin in string format, e.g. 'rdC'
        '''
        start_time = time.time()
        #Convert to Pandas dataframe
        #        df = df.toPandas()
        #        if 'FBD_REGION' in df.columns:
        #            df['FBD_REGION'] = df['FBD_REGION'].apply(lambda x : cls.label_zone(x))

        labelled_failure_modes = []
        df = df.withColumn("row_id", F.monotonically_increasing_id())
        model_features_list, model_name_list, model_dir_list = cls.__read_model_name(
            site, did, rd_item, model_dir)
        print(model_dir_list)
        if len(model_name_list) > 0:
            for name, features, dirname in zip(model_name_list,
                                               model_features_list,
                                               model_dir_list):
                features_missing = [e for e in features if e not in df.columns]
                if len(features_missing) > 0:
                    print('Features %s missing for model %s' %
                          (','.join(features_missing), name))
                else:
                    print(dirname)
                    print(features)
                    try:
                        model = RandomForestClassificationModel.load(
                            str(dirname))
                        # model = LinearSVCModel.load(model_dir)
                        assembler = VectorAssembler(inputCols=features,
                                                    outputCol="features")
                        # Assemble the model's feature columns into a single vector column
                        newData = assembler.transform(df)
                        df_i = model.transform(newData)
                        #df_i = cls.pred_rf_model_spark(dirname, feature, name, df)
                        df_i = df_i.withColumnRenamed("prediction", name)
                        df = df.join(df_i.select("row_id", name), ("row_id"))
                        labelled_failure_modes.append(name)
                        print('Labelling done for: ', name)
                    except Exception:
                        print('Labelling failed for: ', name)
            if len(labelled_failure_modes) > 0:
                df = df.withColumn(
                    'total', sum(df[col] for col in labelled_failure_modes))
                df_labelled = df.filter(df.total > 0)
                df_unlabelled = df.filter(df.total == 0)
            else:
                df_labelled = []
                df_unlabelled = df
        else:
            df_labelled = []
            df_unlabelled = df
            print('No models found for: %s, %s, %s' % (site, did, rd_item))

        print('Labelling time = ', time.time() - start_time)
        start_time = time.time()
        if df_labelled != []:
            df_labelled = df_labelled.toPandas()
        else:
            df_labelled = pd.DataFrame()
        df_unlabelled = df_unlabelled.toPandas()
        print('Pandas df conversion time = ', time.time() - start_time)
        return df_labelled, df_unlabelled, labelled_failure_modes
Example 25
    
sensorImportancesPD = pd.DataFrame.from_records(list(sensorImportances.items()), columns=['Sensor','Importance (%)'])\
  .sort_values('Importance (%)')
    
sb.set_color_codes("pastel")
sb.barplot(x="Importance (%)", y="Sensor", 
           data=sensorImportancesPD,
           label="Total", color="b")

# #### Model Saving/Loading
# We can save models and pipelines for re-use later 
model.bestModel.write().overwrite().save(path='rf_sensor_maintenance.mdl')
!rm -rf rf_sensor_maintenance.mdl
!hdfs dfs -get models/rf_sensor_maintenance.mdl

newModel = RandomForestClassificationModel.load('rf_sensor_maintenance.mdl')
predictions = newModel.transform(li.transform(va))
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Let's see how much maintenance we could have saved if we used this model
def f(actual, predicted, cost):
    if actual==predicted:
        if actual=='Corrective':
          return 0
        elif actual=='Preventive':
          return cost
        elif actual=='Healthy':
          return 30000
    else:
        return cost
spark_session = SparkSession.builder.master("local").appName(
    "wineClasssification").getOrCreate()

print("\nProgram has started : \n")

##--------------------------------------         code to read dataset               ------------------------##
testDataframe = spark_session.read.csv('TestDataset.csv',
                                       header='true',
                                       inferSchema='true',
                                       sep=';')
feature = [c for c in testDataframe.columns if c != 'quality']
assembler_test = VectorAssembler(inputCols=feature, outputCol="features")
test_trans = assembler_test.transform(testDataframe)

##--------------------------------------         code to load model                ------------------------##
model = RandomForestClassificationModel.load("model")

##--------------------------------------         code to predict                ------------------------##
predictions = model.transform(test_trans)

##--------------------------------------         code to print accuracy                ------------------------##
accuracy = MulticlassClassificationEvaluator(
    labelCol="quality", predictionCol="prediction",
    metricName="accuracy").evaluate(predictions)
print("Testing- Accuracy Error = %g" % (1.0 - accuracy))

transformed_data = model.transform(test_trans)
print(
    MulticlassClassificationEvaluator(labelCol="quality",
                                      predictionCol="prediction",
                                      metricName="accuracy").getMetricName(),
# Calculate and print Recall score for Decision Tree Algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
dtcWeightedRecall = evaluator.evaluate(dtcPredictions)
print("Decision Tree weightedRecall Error = %g" % (dtcWeightedRecall))

# Train a RandomForest algorithm
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
rfm = rf.fit(trainingData)

# Save trained Random Forest model to S3 bucket for future use
rfm.save('s3://expedia-hotel-recommendations-workflow/rfm_model')

# Load the pre-trained Random Forest model to illustrate how it will be imported for future use
rfModel = RandomForestClassificationModel.load("s3://expedia-hotel-recommendations-workflow/rfm_model")

# Make predictions with Random Forest model
rfPredictions = rfModel.transform(testData)

# Calculate and print Accuracy score for Random Forest Algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
rfAccuracy = evaluator.evaluate(rfPredictions)
print("Random Forest accuracy Error = %g" % (rfAccuracy))

# Calculate and print F1 score for Random Forest Algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
rfF1 = evaluator.evaluate(rfPredictions)
print("Random Forest f1 Error = %g" % (rfF1))
Example 28
def main(iso_date, base_path):

    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel

    string_indexer_models = {}
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(
            string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today and tomorrow's dates as iso strings to scope query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest))
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in [
            "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin",
            "Dest", "Route"
    ]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay and Distance
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = [
        "Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
        "DayOfYear_index", "Origin_index", "Origin_index", "Dest_index",
        "Route_index"
    ]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(
        today_output_path)
def main(base_path):

  APP_NAME = "make_predictions_streaming.py"

  # Process data every 10 seconds
  PERIOD = 10
  BROKERS = 'localhost:9092'
  PREDICTION_TOPIC = 'flight_delay_classification_request'
  
  try:
    sc and ssc
  except NameError as e:
    import findspark

    # Add the streaming package and initialize
    findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
    findspark.init()
    
    import pyspark
    import pyspark.sql
    import pyspark.streaming
  
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf)
    ssc = StreamingContext(sc, PERIOD)
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # Load all models to be used in making predictions
  #
  
  # Load the arrival delay bucketizer
  from pyspark.ml.feature import Bucketizer
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
  
  # Load all the string field vectorizer pipelines into a dict
  from pyspark.ml.feature import StringIndexerModel
  
  string_indexer_models = {}
  for column in ["Carrier", "Origin", "Dest", "Route"]:
    string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
      base_path,
      column
    )
    string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
    string_indexer_models[column] = string_indexer_model

  # Load the numeric vector assembler
  from pyspark.ml.feature import VectorAssembler
  vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
  vector_assembler = VectorAssembler.load(vector_assembler_path)

  # Load the classifier model
  from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
  random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
    base_path
  )
  rfc = RandomForestClassificationModel.load(
    random_forest_model_path
  )
  
  #
  # Process Prediction Requests in Streaming
  #
  stream = KafkaUtils.createDirectStream(
    ssc,
    [PREDICTION_TOPIC],
    {
      "metadata.broker.list": BROKERS,
      "group.id": "0",
    }
  )

  object_stream = stream.map(lambda x: json.loads(x[1]))
  object_stream.pprint()
  
  row_stream = object_stream.map(
    lambda x: Row(
      FlightDate=iso8601.parse_date(x['FlightDate']),
      Origin=x['Origin'],
      Distance=x['Distance'],
      DayOfMonth=x['DayOfMonth'],
      DayOfYear=x['DayOfYear'],
      UUID=x['UUID'],
      DepDelay=x['DepDelay'],
      DayOfWeek=x['DayOfWeek'],
      FlightNum=x['FlightNum'],
      Dest=x['Dest'],
      Timestamp=iso8601.parse_date(x['Timestamp']),
      Carrier=x['Carrier']
    )
  )
  row_stream.pprint()

  #
  # Create a dataframe from the RDD-based object stream
  #

  def classify_prediction_requests(rdd):
  
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
  
    prediction_request_schema = StructType([
      StructField("Carrier", StringType(), True),
      StructField("DayOfMonth", IntegerType(), True),
      StructField("DayOfWeek", IntegerType(), True),
      StructField("DayOfYear", IntegerType(), True),
      StructField("DepDelay", DoubleType(), True),
      StructField("Dest", StringType(), True),
      StructField("Distance", DoubleType(), True),
      StructField("FlightDate", DateType(), True),
      StructField("FlightNum", StringType(), True),
      StructField("Origin", StringType(), True),
      StructField("Timestamp", TimestampType(), True),
      StructField("UUID", StringType(), True),
    ])
    
    prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #

    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests_df.withColumn(
      'Route',
      concat(
        prediction_requests_df.Origin,
        lit('-'),
        prediction_requests_df.Dest
      )
    )
    prediction_requests_with_route.show(6)
  
    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
      string_indexer_model = string_indexer_models[column]
      prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route)
  
    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)
    
    # Inspect the vectors
    final_vectorized_features.show()
  
    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
      final_vectorized_features = final_vectorized_features.drop(column)
  
    # Inspect the finalized features
    final_vectorized_features.show()
  
    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)
  
    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability")
  
    # Inspect the output
    final_predictions.show()
  
    # Store to Mongo
    if final_predictions.count() > 0:
      final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
        "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
      )
  
  # Do the classification and store to Mongo
  row_stream.foreachRDD(classify_prediction_requests)
  
  ssc.start()
  ssc.awaitTermination()
Example 30
    #load model
    if algoName == "LogisticRegression":
        from pyspark.ml.classification import LogisticRegressionModel
        model = LogisticRegressionModel.load(modelPath)
    elif algoName == "LinearRegression":
        from pyspark.ml.regression import LinearRegressionModel
        model = LinearRegressionModel.load(modelPath)
    elif algoName == "DecisionTreeClassification":
        from pyspark.ml.classification import DecisionTreeClassificationModel
        model = DecisionTreeClassificationModel.load(modelPath)
    elif algoName == "DecisionTreeRegression":
        from pyspark.ml.regression import DecisionTreeRegressionModel
        model = DecisionTreeRegressionModel.load(modelPath)
    elif algoName == "RandomForestClassification":
        from pyspark.ml.classification import RandomForestClassificationModel
        model = RandomForestClassificationModel.load(modelPath)
    elif algoName == "RandomForestRegression":
        from pyspark.ml.regression import RandomForestRegressionModel
        model = RandomForestRegressionModel.load(modelPath)
    elif algoName == "GBTClassification":
        from pyspark.ml.classification import GBTClassificationModel
        model = GBTClassificationModel.load(modelPath)
    elif algoName == "GBTRegression":
        from pyspark.ml.regression import GBTRegressionModel
        model = GBTRegressionModel.load(modelPath)

    #predict
    prediction = model.transform(data).select("prediction")

    #save
    prediction.write.format("csv").save(outputPath)
import sys

#Create and connect to spark session, read data given in docker command
spark = SparkSession.builder.master('local[*]').appName(
    'Predict_model').getOrCreate()
test_set = spark.read.csv(sys.argv[-1], header=True, inferSchema=True, sep=';')

# Create feature vector
assembler = VectorAssembler(inputCols=[
    test_set.columns[0], test_set.columns[1], test_set.columns[2],
    test_set.columns[3], test_set.columns[4], test_set.columns[5],
    test_set.columns[6], test_set.columns[7], test_set.columns[8],
    test_set.columns[9], test_set.columns[10]
],
                            outputCol='features')
test_assembled = assembler.transform(test_set)
test_assembled = test_assembled.select(test_assembled.columns[-1],
                                       test_assembled.columns[-2])

# Load trained classification model
rfp = RandomForestClassificationModel.load('RF_model')

#Predict classes of new data
predictions = rfp.transform(test_assembled)

#Evaluate model performance
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol=test_assembled.columns[-1], metricName='f1')
print('F-1 Score of the classification model:',
      multi_evaluator.evaluate(predictions))