Code Example #1
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import FloatType, IntegerType

# File location and type
file_location = "dataset.csv"
file_type = "csv"

# CSV options
first_row_is_header = "True"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type).option("header", first_row_is_header).option(
    "sep", delimiter).load(file_location)

# Cast the numeric feature columns to float and derive an integer label.
# (`numericCols`, `categoricalCol`, and `stages` are defined earlier in the
# original script; the snippet begins mid-file.)
for i in numericCols:
    df = df.withColumn(i, df[i].cast(FloatType()))
df = df.withColumn("label", df['isFraud'].cast(IntegerType()))

# Assemble the encoded categorical column and the numeric columns into a
# single feature vector.
assemblerInputs = [categoricalCol + "Vec"] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# Pipeline creation
pipeline = Pipeline(stages=stages)

pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)

# Load the model saved in exercise 2. A fitted GBT model is loaded with
# GBTClassificationModel; GBTClassifier is the estimator and has no
# transform method.
gbtModel = GBTClassificationModel.load("gbtModel.model")

# Predict on the held-out split (`test` is assumed to be created earlier).
predictions = gbtModel.transform(test)
Code Example #2
# Rename columns, replacing dots with underscores (see the sketch below for why).
data = data.selectExpr(
    *["`{}` as {}".format(col, col.replace('.', '_')) for col in data.columns])

# Apply the data-preprocessing pipeline that was fitted and saved earlier
# (the path is kept as written so it matches the saved artifact).
from pyspark.ml import PipelineModel
pipelineModel = PipelineModel.load('data_precossing_bank_mkt')

# Transform the raw data and keep only the columns the model expects.
data_model = pipelineModel.transform(data)
data_model = data_model.select(["label", "features"])

# Predict: load the fitted GBT model (GBTClassificationModel, not the
# GBTClassifier estimator, which has no transform method).
from pyspark.ml.classification import GBTClassificationModel
gbtModel = GBTClassificationModel.load("modelo_bank_mkt")

predictions_gbt = gbtModel.transform(data_model)
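
A quick sanity check on the scored rows before evaluating (a usage sketch):

predictions_gbt.select("label", "prediction").show(5)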

# Evaluate
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

# BinaryClassificationEvaluator reports areaUnderROC by default.
evaluator_auc = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="rawPrediction")

accuracy_gbt = evaluator_accuracy.evaluate(predictions_gbt)
print(f'Accuracy:         {accuracy_gbt:.4f}')
auc_gbt = evaluator_auc.evaluate(predictions_gbt)
print(f'Area under ROC:   {auc_gbt:.4f}')
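
To keep a record of the scored rows, a minimal sketch using Spark's own writer; "predictions_bank_mkt" is a hypothetical output directory:

# Sketch: persist label/prediction pairs for later inspection.
(predictions_gbt
    .select("label", "prediction")
    .write.mode("overwrite").parquet("predictions_bank_mkt"))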