Example #1
0
def RandomForestRegressor(temp_path="./"):
    """Demonstrate training, persisting and reloading a random-forest regressor.

    Fits a 2-tree forest on a tiny two-row DataFrame, probes a few model
    properties (the expected values are noted inline), saves both the
    estimator and the fitted model to disk, then reloads them and checks
    that the reloaded model agrees with the original.

    :param temp_path: directory used for the ``rfr`` / ``rfr_model`` save
        paths (defaults to the current directory, matching the original).
    :return: ``True`` when the reloaded model's feature importances equal
        the original model's.

    NOTE(review): ``SparkSession``, ``Vectors``, ``allclose`` and
    ``RandomForestRegressionModel`` are assumed to be imported at the top
    of the file — confirm before running this function in isolation.
    """
    import os
    # BUG FIX: this function shadows the pyspark estimator class of the same
    # name, so a bare `RandomForestRegressor` inside the body resolved to the
    # function itself and recursed (failing on the keyword arguments).
    # Bind the real class locally under an alias instead.
    from pyspark.ml.regression import RandomForestRegressor as RFRegressor

    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    rf = RFRegressor(numTrees=2, maxDepth=2, seed=42)
    model = rf.fit(df)
    model.featureImportances                   # SparseVector(1, {0: 1.0})
    allclose(model.treeWeights, [1.0, 1.0])    # True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    model.transform(test0).head().prediction   # 0.0
    model.numFeatures                          # 1
    model.trees                                # [DecisionTreeRegressionModel (uid=...) of depth..., ...]
    model.getNumTrees                          # 2
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)],
                                  ["features"])
    model.transform(test1).head().prediction   # 0.5
    # os.path.join avoids the ".//rfr" produced by naive "+" concatenation.
    rfr_path = os.path.join(temp_path, "rfr")
    rf.save(rfr_path)
    rf2 = RFRegressor.load(rfr_path)
    rf2.getNumTrees()                          # 2
    model_path = os.path.join(temp_path, "rfr_model")
    model.save(model_path)
    model2 = RandomForestRegressionModel.load(model_path)
    # Surface the round-trip check to the caller instead of discarding it.
    return model.featureImportances == model2.featureImportances
Example #2
0
# Score a prediction dataset with a previously saved random-forest model.
# A flag tells us whether the persisted model is MLlib (RDD-based) or
# Spark ML (DataFrame-based); regression vs. classification dispatch could
# be added the same way.
#
# NOTE(review): `IS_RDD`, `sc`, `sqlContext`, `dataId`, `parseFeatures` and
# the Spark imports are assumed to be defined earlier in the file — confirm
# before running this block in isolation.
if IS_RDD:
    # MLlib path: load the RDD-based model and map the raw CSV lines
    # through the feature parser before predicting.
    rfModel = RandomForestModel.load(
        sc, "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + dataId)
    data = sc.textFile("/home/t752887/data/PRED_DATASET.csv")
    parsedFeaturesData = data.map(parseFeatures)
    print("prediction")
    print(rfModel.predict(parsedFeaturesData).collect())
else:
    # Spark ML path: load the pipeline, the fitted model and the estimator.
    # BUG FIX: Pipeline.load() is a classmethod returning the loaded
    # pipeline — the original constructed an empty Pipeline() and then
    # discarded load()'s return value, leaving rfPipeline empty.
    rfPipeline = Pipeline.load(
        "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + dataId + "_Pipeline")
    rfModel = PipelineModel.load(
        "/home/t752887/python/myModelPath/SPARK_RF_Regression_" + dataId)
    rf = RandomForestRegressor.load(
        "/home/t752887/python/myModelPath/SPARK_RF_R_" + dataId)
    # Input: same layout as the file used to build the model, one row per
    # compound to predict, without the response column.
    datasetDF = sqlContext.read.format('csv').options(
        delimiter=';', header='true', inferschema='true',
        nullValue='').load("/home/t752887/data/PRED_DATASET.csv")

    predictions = rfModel.transform(datasetDF).select("prediction")
    print(predictions.toPandas().to_string(index=False))