] for i in numericCols: df = df.withColumn(i, df[i].cast(FloatType())) df = df.withColumn("label", df['isFraud'].cast(IntegerType())) assemblerInputs = [categoricalCol + "Vec"] + numericCols assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") stages += [assembler] #pipeline creation pipeline = Pipeline(stages=stages) pipelineModel = pipeline.fit(df) df = pipelineModel.transform(df) #Loading Model which were saved in exc 2 gbtModel = GBTClassifier.load("gbtModel.model") #Predicting Value predictions = gbtModel.transform(test) # File location and type file_location = "dataset.csv" file_type = "csv" # CSV options first_row_is_header = "True" delimiter = "," # The applied options are for CSV files. For other file types, these will be ignored. df = spark.read.format(file_type).option("header", first_row_is_header).option( "sep", delimiter).load(file_location)
# Rename columns: Spark SQL treats '.' as a struct-field accessor, so every
# dotted column name is aliased (backtick-escaped) to an underscore form.
data = data.selectExpr(
    *["`{}` as {}".format(col, col.replace('.', '_')) for col in data.columns])

# process: apply the persisted feature-engineering pipeline to the raw data.
from pyspark.ml import Pipeline, PipelineModel

pipelineModel = PipelineModel.read().load('data_precossing_bank_mkt')
data_model = pipelineModel.transform(data)
data_model = data_model.select(["label", "features"])

# predict
# BUG FIX: GBTClassifier.load() reloads the *estimator*, which has no
# transform() method — calling transform() on it raises AttributeError.
# A fitted, persisted model must be reloaded with GBTClassificationModel.
from pyspark.ml.classification import GBTClassificationModel

gbtModel = GBTClassificationModel.load("modelo_bank_mkt")
predictions_gbt = gbtModel.transform(data_model)

# evaluate
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
evaluator_auc = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="rawPrediction")

accuracy_gbt = evaluator_accuracy.evaluate(predictions_gbt)
print(f'Accuracy: {accuracy_gbt:.4f}')

auc_gbt = evaluator_auc.evaluate(predictions_gbt)
# BUG FIX: AUC was computed but never reported — print it alongside accuracy.
print(f'AUC: {auc_gbt:.4f}')