def __prediction(self, df_pred):
    logger.info("__prediction: LOADING PIPELINE ##################################### ")
    gbt_pipeline = Pipeline.load('budget_prediction_pipeline/')
    gbt_pipeline_loaded = gbt_pipeline.fit(df_pred)
    ddf_features_df = gbt_pipeline_loaded.transform(df_pred)
    logger.info("__prediction: FILTERING DATA ##################################### ")
    ddf_features_df = ddf_features_df.filter("idmovie in (99999)")
    logger.info("__prediction: LOADING MODEL ##################################### ")
    gbt_model_load = GBTRegressionModel.load('gbt_model_old/')
    gbt_model_pred = gbt_model_load.transform(ddf_features_df)
    gbt_model_pred.selectExpr(
        'idmovie', 'director', 'genres', 'runtime',
        'cast(prediction as Decimal(38,2)) as prediction').show(truncate=False)
    logger.info("__prediction: DATA PREDICTED ##################################### ")
    return gbt_model_pred.selectExpr('director', 'genres', 'runtime', 'prediction')
def loadModels(path, typeofmodel):
    models = {}
    for park in park_data_with_date_dict:
        if typeofmodel == "linear":
            models[park] = LinearRegressionModel.load(path + str(park))
        elif typeofmodel == "tree":
            models[park] = DecisionTreeRegressionModel.load(path + str(park))
        elif typeofmodel == "gbt":
            models[park] = GBTRegressionModel.load(path + str(park))
    return models
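# A minimal usage sketch for loadModels above (illustrative only): the path prefix
# is a placeholder, and park_feature_dfs is an assumed dict of per-park DataFrames
# that already carry an assembled "features" vector column.
park_models = loadModels("models/park_gbt_", "gbt")        # hypothetical saved-model path prefix
for park, park_model in park_models.items():
    predictions = park_model.transform(park_feature_dfs[park])
    predictions.select("prediction").show(5)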
def predict():
    spark = SparkSession.builder.appName('airbnb_price').getOrCreate()
    if request.method == 'POST':
        raw_data = spark.read.json(request.json)
        model = GBTRegressionModel.load(MODEL_PATH)
        data = data_processing(raw_data)
        gbt_predictions = model.transform(data)
        output = gbt_predictions.select('prediction')
        # toJSON() returns an RDD of JSON strings; collect it before serialising
        json_output = output.toJSON().collect()
        return jsonify({'prediction': json_output})
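# Hedged client-side sketch for the Flask predict() endpoint above (illustrative
# only): the route, host, port, and payload fields are assumptions, not taken
# from the original source.
import requests

resp = requests.post("http://localhost:5000/predict",          # hypothetical route
                     json={"accommodates": 2, "bedrooms": 1})  # hypothetical listing fields
print(resp.json())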
def predict(self): """ :return: """ cols = [x for x in self.data.columns if x not in ['datetime', 'label']] assembler = VectorAssembler(handleInvalid="keep").setInputCols \ (cols).setOutputCol("features") print('assembler') test = assembler.transform(self.data) test = test.drop(*cols) rf = GBTRegressionModel.load('myGBTRegressor_nan') preds = rf.transform(test) print(preds.printSchema()) preds.write.save("regression_preds_5.parquet") return preds
def GBT_regressor():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
    print(gbt.getImpurity())  # variance
    model = gbt.fit(df)
    model.featureImportances  # SparseVector(1, {0: 1.0})
    model.numFeatures  # 1
    allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])  # True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    model.transform(test0).head().prediction  # 0.0
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    model.transform(test1).head().prediction  # 1.0
    temp_path = "./"
    gbtr_path = temp_path + "gbtr"
    gbt.save(gbtr_path)
    gbt2 = GBTRegressor.load(gbtr_path)
    gbt2.getMaxDepth()  # 2
    model_path = temp_path + "gbtr_model"
    model.save(model_path)
    model2 = GBTRegressionModel.load(model_path)
    model.featureImportances == model2.featureImportances  # True
    model.treeWeights == model2.treeWeights  # True
    model.trees
def loadModel(self):
    if self.algoName == "linear_reg" or self.algoName == "ridge_reg" \
            or self.algoName == "lasso_reg":
        regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "RandomForestAlgo":
        regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "GradientBoostAlgo":
        regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)
    # drop the prediction column already created by the same model
    self.dataset = self.dataset.drop(self.modelSheetName)
    predictionData = regressionPrediction.transform(self.dataset)
    predictionData = predictionData.drop(self.featuresColm)
    # drop the extra columns added during feature preparation
    if self.indexedFeatures:
        self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
        predictionData = predictionData.drop(*self.indexedFeatures)
    # overwrite the original dataset
    '''this step is needed because Spark does not read or write the whole dataset
    at once -- it only pulls limited data into memory and evaluates lazily, so
    overwriting a dataset that is still being read is not possible; write to a
    temporary file first and read it back'''
    emptyUserId = ''
    fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
    predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
    predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)
    predictionTableData = \
        PredictiveUtilities.writeToParquet(fileName=self.datasetName,
                                           locationAddress=self.locationAddress,
                                           userId=emptyUserId,
                                           data=predictionDataReadAgain)
    return predictionTableData
def predict(features_tab, tab_out, model_path, veh):
    # 1 Configuration
    spark = SparkSession \
        .builder \
        .master("yarn") \
        .appName("tianzw_vol_fading_predict_second_versions") \
        .config("spark.sql.warehouse.dir", "hdfs://neicluster/user/hive/warehouse") \
        .enableHiveSupport() \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    # 2 Prepare the data
    sql = """
        SELECT vin
             , sta_time
             , mils_1000km
             , sta_soc
             , charge_c
             , hours
             , temp
             , days
             , mils_dif
             , cnt_cha
             , vol_cha
             , vol_avg_cha
             , hou_cha
             , c_avg
             , sta_soc_avg_cha
             , end_soc_avg_cha
             , dep_soc_avg_cha
             , sta_soc_mid_cha
             , end_soc_mid_cha
             , dep_soc_mid_cha
             , cnt_tem
             , tem_mid_yea
             , tem_avg_yea
             , tem_dif_yea
             , tem_var_yea
        FROM """ + features_tab + """
        WHERE veh_head = SUBSTR('""" + veh + """',0,1)
          AND veh = '""" + veh + """'
    """
    rdd_origin = spark.sql(sql).rdd
    features_rdd = rdd_origin.map(lambda x: (
        x.vin,
        x.sta_time,
        Vectors.dense([
            x.mils_1000km, x.sta_soc, x.charge_c, x.hours, x.temp, x.days,
            x.mils_dif, x.cnt_cha, x.vol_cha, x.vol_avg_cha, x.hou_cha, x.c_avg,
            x.sta_soc_avg_cha, x.end_soc_avg_cha, x.dep_soc_avg_cha,
            x.sta_soc_mid_cha, x.end_soc_mid_cha, x.dep_soc_mid_cha, x.cnt_tem,
            x.tem_mid_yea, x.tem_avg_yea, x.tem_dif_yea, x.tem_var_yea
        ]),
    ))
    features_list = features_rdd.collect()
    print("Data extraction succeeded")
    spark_df = spark.createDataFrame(features_list, ["vin", "sta_time", "features"])

    # 3 Model prediction
    # model = GBTRegressor.load(model_path)
    model = GBTRegressionModel.load(model_path)
    print("Model loaded successfully")
    predictions = model.transform(spark_df)
    print("Computation succeeded")
    new_list = [(x.vin, x.sta_time, x.prediction) for x in predictions.collect()]
    result_df = spark.createDataFrame(new_list, ["vin", "sta_time", "vol_fading"])
    result_df = result_df.repartition(1)
    result_df.createOrReplaceTempView("table_temp")

    # Write the data into the Hive table
    # createSQL = """
    # CREATE EXTERNAL TABLE IF NOT EXISTS """ + tab_out + """
    # (
    #     vin        STRING COMMENT 'vehicle identification number',
    #     sta_time   BIGINT COMMENT 'charge start time (s)',
    #     vol_fading DOUBLE COMMENT 'predicted capacity fading percentage (2 decimal places)'
    # )
    # PARTITIONED BY(veh STRING COMMENT 'vehicle model name')
    # ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    # """
    insertSql = """
        INSERT OVERWRITE TABLE """ + tab_out + """ PARTITION(veh = '""" + veh + """')
        SELECT vin, sta_time, ROUND(vol_fading, 2) AS vol_fading
        FROM table_temp
    """
    # spark.sql("DROP TABLE IF EXISTS " + tab_out)
    # spark.sql(createSQL)
    spark.sql(insertSql)
    print(tab_out + " written successfully")
def __load_from_hdfs(self):
    mdl = GBTRegressionModel.load(self.hdfs_uri)
    print("GBT() - model loaded from uri {}".format(self.hdfs_uri))
    return mdl
StructField("EDUCATION", StringType(), True), StructField("OCCUPATION", StringType(), True), StructField("TRAVTIME", StringType(), True), StructField("BLUEBOOK", StringType(), True), StructField("TIF", StringType(), True), StructField("CAR_TYPE", StringType(), True), StructField("OLDCLAIM", StringType(), True), StructField("CLM_FREQ", StringType(), True), StructField("MVR_PTS", StringType(), True), StructField("CAR_AGE", StringType(), True) ]) print("Loading Models....") PModel = PipelineModel.load(PipelineLoc) LRModel = LogisticRegressionModel.load(CatModelLoc) GBTModel = GBTRegressionModel.load(IntModelLoc) if KafkaBserver == "<ENTER-KAFKAZBROKER-HERE>": print("Update Kafka server in the file") exit() raw_records = spark.readStream.format("kafka") \ .option("kafka.bootstrap.servers", KafkaBserver)\ .option("subscribe", "NewUser") \ .option("startingOffsets", "latest") \ .load() NestedJsonDf = raw_records.select( f.col("key").cast("string"), f.from_json(f.col("value").cast("string"), schema).alias("UserRecord")) FlatDf = NestedJsonDf.selectExpr(
print(output.count())
test_features = output.na.drop()
print(test_features.count())

# test_output = assembler.transform(test)
# print(test_output.count())
# train_output = test_output.na.drop()
# print(test_output.count())
# print("Assembled columns 'hour', 'minute' .. to vector column 'features'")
# test_output.show(truncate=False)  # .select("features", "clicked")

from pyspark.ml.regression import GBTRegressor, GBTRegressionModel

# gbt = GBTRegressor(featuresCol="features", maxIter=10)
path = "bike_sharing_gbt_file.model"
gbt_model = GBTRegressionModel.load(path)

# Make predictions.
print("Before model creation")
predictions = gbt_model.transform(test_features)
print("After model creation")
predictions.printSchema()
predictions.show()
# gbt_model.write().overwrite().save(path)

# Select example rows to display.
from pyspark.sql.functions import col, lit, concat

bs_df.show()
# predictions = predictions.withColumn("datetime", bs_df.select("datetime"))
predictions = predictions.withColumn(
    "datetime",
    concat(col("year"), lit("-"), col("month"), lit("-"), col("day"),
def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
    model_name = 'Distr_GBTRegressor'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict, options, task_id, job_id, train_result_dir, \
        names_str, names_num, names_show, Y_names, dir_of_inputdata, \
        dir_of_outputdata, open_pca, train_size, test_size, normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    # Configure the Spark session
    sess = SparkSession \
        .builder \
        .master("local[4]") \
        .appName("GBTRegressor_spark") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For testing
        # dataset = dataset[0:1000]
        Y_datavec = dataset[Y_names].values
        # Obtain the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalise the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle imbalanced data
        # X, Y = mlp.KMeans_unbalanced(X_datavec, Y_datavec, X_columns, Y_names)
        # X, Y = mlp.Sample_unbalanced(X_datavec, Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Persist the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X)
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate the matrices
        merged = pd.concat([features, targets], axis=1)
        # Create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and target
        fomula = RFormula(formula='Y ~ .', featuresCol="features", labelCol="label")
        raw_df = fomula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size], seed=666)
        # Train the model
        clf_model = dmp.Distr_GBTRegressor(xy_train, xy_test)
        # Save the model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show, clf_model,
                              dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Obtain the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str, names_num,
                                                      names_show, vocabset, 'close')
        # Normalise the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print('--------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X)
        # Create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
        clf_model = GBTRegressionModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model,
                         dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)
def load_model(path, classifier='gbt'):
    if classifier == 'gbt':
        return GBTRegressionModel.load(path)  # for gbt
    return Pipeline.load(path)  # for random forest
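# A minimal usage sketch for load_model above (illustrative only): the saved-model
# path is a placeholder assumption, and the tiny input DataFrame is fabricated just
# to show the expected "features" column shape; its dimension must match the model.
from pyspark.ml.linalg import Vectors

gbt_model = load_model("models/gbt_saved")  # hypothetical path to a saved GBTRegressionModel
features_df = spark.createDataFrame([(Vectors.dense([1.0, 2.0, 3.0]),)], ["features"])
gbt_model.transform(features_df).select("prediction").show()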
NoScale_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/NoScale_Pca.model')
Scaled_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/Scaled_Pca.model')

NoScale_Pca = NoScale_Pca.transform(vector_vehicle_df).select(
    ["og_features", "features"])
Scaled_Pca = Scaled_Pca.transform(scaledData).select(
    ["og_features", "features"])

# Loading models
lr_model = LinearRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/lr_model.model')
dtr_model = DecisionTreeRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/dtr_model.model')
gbt_model = GBTRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/gbt_model.model')
rf_model = RandomForestRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/rfr_model.model')

# Generate predictions
lr_pred = lr_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
dtr_pred = dtr_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
gbt_pred = gbt_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
rfr_pred = rf_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']

# Prepare output df to hold the predictions
output_df = pd.DataFrame()
output_df['Algorithm'] = [
def prediction(self, predictiveData):
    '''create a duplicate dataset to avoid changing the datatypes of the original dataset'''
    datasetAdd = predictiveData.get(PredictiveConstants.DATASETADD)
    spark = predictiveData.get(PredictiveConstants.SPARK)
    dataset = spark.read.parquet(datasetAdd)

    # add an extra internal index column to the dataset
    dataset = PredictiveUtilities.addInternalId(dataset)
    predictiveData.update({
        PredictiveConstants.DATASET: dataset
    })
    etlStats = PredictiveUtilities.performETL(etlInfo=predictiveData)
    dataset = etlStats.get(PredictiveConstants.DATASET)
    originalDataset = etlStats.get(PredictiveConstants.ORIGINALDATASET)

    algoName = predictiveData.get(PredictiveConstants.ALGORITHMNAME)
    modelStorageLocation = predictiveData.get(PredictiveConstants.MODELSTORAGELOCATION)
    modelName = predictiveData.get(PredictiveConstants.MODELSHEETNAME)
    datasetName = predictiveData.get(PredictiveConstants.DATASETNAME)
    locationAddress = predictiveData.get(PredictiveConstants.LOCATIONADDRESS)

    if PredictiveConstants.LINEAR_REG.__eq__(algoName) or \
            PredictiveConstants.RIDGE_REG.__eq__(algoName) or \
            PredictiveConstants.LASSO_REG.__eq__(algoName):
        regressionPrediction = LinearRegressionModel.load(modelStorageLocation)
    if PredictiveConstants.RANDOMFORESTALGO.__eq__(algoName):
        regressionPrediction = RandomForestRegressionModel.load(modelStorageLocation)
    if PredictiveConstants.GRADIENTBOOSTALGO.__eq__(algoName):
        regressionPrediction = GBTRegressionModel.load(modelStorageLocation)

    dataset = dataset.drop(modelName)
    originalDataset = originalDataset.drop(modelName)
    dataset = regressionPrediction.transform(dataset)
    dataset = dataset.select(PredictiveConstants.DMXINDEX, modelName)
    finalDataset = originalDataset.join(dataset, on=[PredictiveConstants.DMXINDEX]) \
        .sort(PredictiveConstants.DMXINDEX).drop(PredictiveConstants.DMXINDEX)

    # predictionData = predictionData.drop(featuresColm)
    #
    # # dropping extra added columns
    # if indexedFeatures:
    #     indexedFeatures.extend(oneHotEncodedFeaturesList)
    #     predictionData = predictionData.drop(*indexedFeatures)

    # overwrite the original dataset
    '''this step is needed because Spark does not read or write the whole dataset
    at once -- it only pulls limited data into memory and evaluates lazily, so
    overwriting a dataset that is still being read is not possible; write to a
    temporary file first and read it back'''
    emptyUserId = ''
    randomUUID = str(uuid.uuid4())
    fileNameWithPathTemp = locationAddress + randomUUID + datasetName + "_temp.parquet"  # correct the name.
    finalDataset.write.parquet(fileNameWithPathTemp, mode="overwrite")  # send this path to java for deletion
    predictionDataReadAgain = spark.read.parquet(fileNameWithPathTemp)
    predictionTableData = \
        PredictiveUtilities.writeToParquet(fileName=datasetName,
                                           locationAddress=locationAddress,
                                           userId=emptyUserId,
                                           data=predictionDataReadAgain)
    return predictionTableData
def load_json_and_predict(spark, sqlContext, json_file):
    # Load data to predict
    # predict_df = spark.read.json(JSON_DATA_TO_PREDICT)
    print("Loading prediction data from ", json_file)
    predict_df = spark.read.json(json_file)
    print("Done")

    # Apply the same preprocessing as the historical data to convert/map columns.
    # Drop rows with NA columns.
    print("Preprocessing...")
    predict_df_1 = predict_df.dropna()
    predict_df_1 = predict_df_1[
        (predict_df_1.subtotal > 0) &
        (predict_df_1.min_item_price > 0) &
        (predict_df_1.max_item_price > 0) &
        (predict_df_1.total_onshift_runners >= 0) &
        (predict_df_1.total_busy_runners >= 0) &
        (predict_df_1.total_outstanding_orders >= 0) &
        (predict_df_1.estimated_order_place_duration > 0) &
        (predict_df_1.estimated_store_to_consumer_driving_duration > 0) &
        (predict_df_1.market_id != "NA") &
        (predict_df_1.store_primary_category != "NA") &
        (predict_df_1.order_protocol != "NA")]

    udf_rdd_datetimesec_to_sec = fn.udf(
        rdd_datetimesec_to_sec, IntegerType())  # LongType() not available for now
    predict_df_1 = predict_df_1.withColumn(
        'created_at', udf_rdd_datetimesec_to_sec(fn.col('created_at')))

    # Map store_id string to a unique number
    stringindexer = StringIndexer().setInputCol("store_id").setOutputCol("store_id_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    # Map store_primary_category to a unique number
    stringindexer = StringIndexer().setInputCol(
        "store_primary_category").setOutputCol("store_primary_category_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    predict_df_1 = predict_df_1.withColumn(
        "market_id", predict_df_1["market_id"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "order_protocol", predict_df_1["order_protocol"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_onshift_runners", predict_df_1["total_onshift_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_busy_runners", predict_df_1["total_busy_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_outstanding_orders", predict_df_1["total_outstanding_orders"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_store_to_consumer_driving_duration",
        predict_df_1["estimated_store_to_consumer_driving_duration"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "subtotal", predict_df_1["subtotal"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "num_distinct_items", predict_df_1["num_distinct_items"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_order_place_duration",
        predict_df_1["estimated_order_place_duration"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_items", predict_df_1["total_items"].cast(IntegerType()))
    print("Done")

    # Use the same features as in the historical data.
    # Other columns in the test data ('store_id', 'store_primary_category',
    # 'min_item_price', 'max_item_price') will be dropped by the VectorAssembler transformation.
    print("Vectorize...")
    pvectorAssembler = VectorAssembler(inputCols=feature_list, outputCol='features')
    vectorized_predict_df = pvectorAssembler.transform(predict_df_1)
    vectorized_predict_df = vectorized_predict_df.select(['features'])
    print("Done...")

    txt_file = open(MODEL_NAME_FILE, "r")
    model_name = txt_file.read()
    print("Read model: ", model_name)
    txt_file.close()

    print("Loading model " + model_name + " from " + MODEL_DIR)
    if model_name == DT_MODEL:
        predict_model = DecisionTreeRegressionModel.load(MODEL_DIR)
    if model_name == GBT_MODEL:
        predict_model = GBTRegressionModel.load(MODEL_DIR)
    if model_name == LR_MODEL:
        predict_model = LinearRegressionModel.load(MODEL_DIR)
    if model_name == RF_MODEL:
        predict_model = RandomForestRegressionModel.load(MODEL_DIR)
    print("Done")

    print("Predicting...")
    model_predictions = predict_model.transform(vectorized_predict_df)
    print("Done")

    df1 = predict_df_1.select('delivery_id').withColumn(
        "id", monotonically_increasing_id())
    df2 = model_predictions.select('prediction').withColumnRenamed(
        'prediction', 'predicted_delivery_seconds').withColumn(
        "id", monotonically_increasing_id())
    # Perform a join on the ids.
    prediction_results_df = df1.join(df2, "id", "left").drop("id")
    prediction_results_df = prediction_results_df.withColumn(
        "predicted_delivery_seconds",
        prediction_results_df["predicted_delivery_seconds"].cast(IntegerType()))

    return prediction_results_df
# load model
if algoName == "LogisticRegression":
    from pyspark.ml.classification import LogisticRegressionModel
    model = LogisticRegressionModel.load(modelPath)
elif algoName == "LinearRegression":
    from pyspark.ml.regression import LinearRegressionModel
    model = LinearRegressionModel.load(modelPath)
elif algoName == "DecisionTreeClassification":
    from pyspark.ml.classification import DecisionTreeClassificationModel
    model = DecisionTreeClassificationModel.load(modelPath)
elif algoName == "DecisionTreeRegression":
    from pyspark.ml.regression import DecisionTreeRegressionModel
    model = DecisionTreeRegressionModel.load(modelPath)
elif algoName == "RandomForestClassification":
    from pyspark.ml.classification import RandomForestClassificationModel
    model = RandomForestClassificationModel.load(modelPath)
elif algoName == "RandomForestRegression":
    from pyspark.ml.regression import RandomForestRegressionModel
    model = RandomForestRegressionModel.load(modelPath)
elif algoName == "GBTClassification":
    from pyspark.ml.classification import GBTClassificationModel
    model = GBTClassificationModel.load(modelPath)
elif algoName == "GBTRegression":
    from pyspark.ml.regression import GBTRegressionModel
    model = GBTRegressionModel.load(modelPath)

# predict
prediction = model.transform(data).select("prediction")

# save
prediction.write.format("csv").save(outputPath)
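# Hedged input-preparation sketch for the dispatch block above (illustrative only):
# every loaded model scores a DataFrame that already carries an assembled "features"
# vector column; the input path variable and feature column names are assumptions,
# not taken from the original source.
from pyspark.ml.feature import VectorAssembler

raw_df = spark.read.parquet(dataPath)                      # dataPath is hypothetical, like modelPath above
assembler = VectorAssembler(inputCols=["f1", "f2", "f3"],  # hypothetical feature columns
                            outputCol="features")
data = assembler.transform(raw_df)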
"jar_files/commons-pool2-2.6.2.jar") \ .getOrCreate() # Set number of output partitions spark.conf.set("spark.sql.shuffle.partitions", 5) # Set log level spark.sparkContext.setLogLevel("ERROR") target_cols = norm_params['target'] models = {} for target in target_cols: model = [model for model in model_params if target in model] models[target] = GBTRegressionModel.load(model[0]) else: raise TypeError("Unrecognized model") if not isinstance(model_params, list): if model_params.endswith('.pt') or 'sklearn' in model_params: for message in consumer: df = pd.read_json(message.value) timeAtServer = float(df.timeAtServer) aircraft = int(df.aircraft)