def RandomForestRegressor():
    """Worked example: train, inspect, persist, and reload a random-forest regressor.

    Trains a tiny 2-tree forest on a 2-row DataFrame, checks predictions and
    feature importances, then round-trips both the estimator and the fitted
    model through save/load.

    Returns: None (the expression statements below are doctest-style checks;
    expected values are noted in the trailing comments).

    NOTE(review): this function's name shadows pyspark's
    ``RandomForestRegressor`` class, so the original constructor call
    ``RandomForestRegressor(numTrees=2, ...)`` recursed into this function,
    and ``RandomForestRegressor.load(...)`` was an attribute lookup on the
    function. Fixed by importing the class under a local alias.
    """
    # Local, aliased imports avoid the name collision with this function and
    # keep the module importable without pyspark installed.
    from numpy import allclose
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.regression import (
        RandomForestRegressor as RFRegressor,
        RandomForestRegressionModel,
    )
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )
    df = spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"],
    )

    rf = RFRegressor(numTrees=2, maxDepth=2, seed=42)
    model = rf.fit(df)
    model.featureImportances                      # SparseVector(1, {0: 1.0})
    allclose(model.treeWeights, [1.0, 1.0])       # True

    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    model.transform(test0).head().prediction      # 0.0
    model.numFeatures                             # 1
    model.trees                                   # [DecisionTreeRegressionModel (uid=...) of depth..., ...]
    model.getNumTrees                             # 2

    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    model.transform(test1).head().prediction      # 0.5

    # Round-trip the (unfitted) estimator through disk.
    temp_path = "./"
    rfr_path = temp_path + "/rfr"
    rf.save(rfr_path)
    rf2 = RFRegressor.load(rfr_path)
    rf2.getNumTrees()                             # 2

    # Round-trip the fitted model and confirm importances survive.
    model_path = temp_path + "/rfr_model"
    model.save(model_path)
    model2 = RandomForestRegressionModel.load(model_path)
    model.featureImportances == model2.featureImportances  # True
# Score a prediction dataset with a previously persisted random-forest
# regression model. Two persistence formats are supported, selected by the
# IS_RDD flag:
#   - RDD / mllib path: RandomForestModel over an RDD of feature vectors
#   - DataFrame / ml path: a fitted PipelineModel over a CSV-backed DataFrame
# TODO(review): whether the model is regression/classification/biclass and
# whether the input is an RDD or a DataFrame should come from configuration
# rather than a hard-coded flag.
if IS_RDD:
    # mllib API: load needs the SparkContext alongside the path.
    rfModel = RandomForestModel.load(
        sc, "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + dataId
    )
    data = sc.textFile("/home/t752887/data/PRED_DATASET.csv")
    # parseFeatures is defined elsewhere; presumably it maps a CSV line to a
    # feature vector — TODO confirm against its definition.
    parsedFeaturesData = data.map(parseFeatures)
    print("prediction")
    print(rfModel.predict(parsedFeaturesData).collect())
else:
    # ml API: load the fitted pipeline model directly.
    # NOTE(review): the original instantiated ``Pipeline()`` and then
    # discarded the result of ``Pipeline.load(...)`` (load is a classmethod
    # that returns a new object) — that dead code is removed here.
    rfModel = PipelineModel.load(
        "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + dataId
    )
    rf = RandomForestRegressor.load(
        "/home/t752887/python/myModelPath/SPARK_RF_R_" + dataId
    )
    # Input must have the same layout as the training file, one row per
    # compound to predict, and of course no response column.
    datasetDF = (
        sqlContext.read.format('csv')
        .options(delimiter=';', header='true', inferschema='true', nullValue='')
        .load("/home/t752887/data/PRED_DATASET.csv")
    )
    predictions = rfModel.transform(datasetDF).select("prediction")
    print(predictions.toPandas().to_string(index=False))