Example #1
    def Prediction(self, modelType):
        data_point = self.Features
        # all supported model types load and score the same way; only the
        # model class differs, so dispatch through a dict
        model_classes = {
            'RF': RandomForestModel,
            'GBDT': GradientBoostedTreesModel,
            'LRsgd': LogisticRegressionModel,
            'LRlbfgs': LogisticRegressionModel,
            'SVM': SVMModel,
        }
        if modelType in model_classes:
            model = model_classes[modelType].load(
                self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
            result = np.array(
                model.predict(self.sc.parallelize(data_point)).collect())
            self.df_PD.insert(len(self.df_PD.columns), 'result', result)
Example #2
def getPredictions(pfDF, decodePlayerIds, rddDir, sc, pitching_hitter, predictors):
    all_fd_points_df = None
    #playerIds = pfDF.map(lambda x: str(x.player_id) + '_' + x.game_id)
    playerIds = pfDF.map(lambda x: x.player_id).map(lambda x: decodePlayerIds[x])
    print "playerIds=", playerIds.collect()
    for predictor in predictors:
        print "predictor=", predictor
        #modelFilename=rddDir + "pitching_" + predictor + "_model.RandomForest"
        modelFilename = rddDir + pitching_hitter + "_" + predictor + "_model.RandomForest"
        model = GradientBoostedTreesModel.load(sc, modelFilename)
        data = toLabeledPoint(pfDF, predictor)
        #pitcherFeatures = pfDF.collect()
        predictions = model.predict(data.map(lambda x: x.features))
        print "p predictions=", predictions
        print "p predictions take=", predictions.take(16)
        labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions).cache()
        print "predictions=", labelsAndPredictions.take(16)
        #pitcherPredictions = pfDF.map(lambda x: x.asDict()['player_id']).map(lambda x: decodePlayerIds[x]).zip(predictions).cache()
        #print "pitcherPredictions=", pitcherPredictions.take(16)
        if all_fd_points_df is None:
            #all_fd_points_df = testData.map(lambda x: x.player_id).zip(predictions).toDF(['player_id', predictor]).cache()
            print "FIRST: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor])
            print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            print "# all_fd_points_df", all_fd_points_df.count()
            print "first all_fd_points_df", all_fd_points_df.take(5)
            print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
        else:
            print "ELSE: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            curr_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor])
            print "all_fd_points_df", all_fd_points_df.printSchema()
            print "PRE all_fd_points_df", all_fd_points_df.take(16)
            print "curr_fd_points_df", curr_fd_points_df.printSchema()
            print "# curr_fd_points_df", curr_fd_points_df.count()
            #print "distinct curr_fd_points_df", curr_fd_points_df.select('player_id').distinct().count()
            print "first curr", curr_fd_points_df.take(16)
            all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, all_fd_points_df.player_id == curr_fd_points_df.player_id, 'inner').drop(curr_fd_points_df.player_id)
            print "second ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            #print "all debugstring", all_fd_points_df.rdd.toDebugString()
            print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
    def sumFD(r):
        # sum the per-category predictions into a single fd_points total
        x = r.asDict()
        fd_sum = 0.0
        for k, v in x.items():
            if k not in ['fd_points', 'player_id']:
                fd_sum += v
        x['fd_sum'] = fd_sum
        x['fd_points_orig'] = x['fd_points']
        x['fd_points'] = fd_sum
        print("sumx=", x)
        return Row(**x)

    predictions = all_fd_points_df.map(sumFD)
    print(pitching_hitter + " predictions=", predictions.take(50))
    return predictions
Example #3
def getFDPointsPredictions(pfDF, decodePlayerIds, rddDir, sc, pitching_hitter, predictors):
    all_fd_points_df = None
    playerIds = pfDF.map(lambda x: x.player_id).map(lambda x: decodePlayerIds[x])
    print "playerIds=", playerIds.collect()
    predictor = 'fd_points'
    print "predictor=", predictor
    modelFilename = rddDir + pitching_hitter + "_" + predictor + "_model.RandomForest"
    model = GradientBoostedTreesModel.load(sc, modelFilename)
    data = toLabeledPoint(pfDF, predictor)
    predictions = model.predict(data.map(lambda x: x.features))
    print "p predictions=", predictions
    print "p predictions take=", predictions.take(16)
    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions).cache()
    print "labelsAndPredictions=", labelsAndPredictions.take(16)
    print " # playerIds=", playerIds.count()
    all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor])
    print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
    print "# all_fd_points_df", all_fd_points_df.count()
    print "first all_fd_points_df", all_fd_points_df.take(5)
    print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
    print pitching_hitter + " predictions=", all_fd_points_df.take(50)
    return all_fd_points_df
Example #4
import flask
from flask import Flask, request, url_for, Response
from sklearn.externals import joblib
from pyspark.mllib.tree import GradientBoostedTreesModel
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext, SparkConf
import json
app = Flask(__name__)

# load the model
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

model = GradientBoostedTreesModel.load(sc, './sellModel')

@app.route("/", methods=["GET"])
def index():
    with app.test_request_context():
        # build, for each endpoint, the URL it listens on and its expected parameters
        result = {"gbdt": {"url": url_for("gbdt"),
                           "params": ["vector"]}}

        result_body = flask.json.dumps(result)

        return Response(result_body, mimetype="application/json")

@app.route("/ml/gbdt", methods=["GET"])
def gbdt():
    request_args = request.args

    # if no parameters were passed in, return a hint message
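    # The example is truncated here. A minimal, hypothetical sketch of how the
    # handler might continue, assuming the feature vector arrives as a
    # JSON-encoded list in a "vector" query parameter (the parameter handling
    # below is an assumption, not part of the original source):
    if "vector" not in request_args:
        return Response(
            flask.json.dumps({"error": "required param: vector"}),
            mimetype="application/json")

    vector = json.loads(request_args["vector"])
    # predict() on a single feature vector returns a plain float
    prediction = model.predict(vector)
    return Response(flask.json.dumps({"prediction": prediction}),
                    mimetype="application/json")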
Example #5
# %%
svm_predictions = test.map(
    lambda line: (line[0], line[1], float(svm_model.predict(line[3]))))
svm_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/svm_predictions.csv")

# %% [markdown]
# Date: 2020-12-20 14:18:59  Rank: none
# score: 0.5156678
# %% [markdown]
# ## Gradient Boosted Trees

# %%
from pyspark.mllib.tree import GradientBoostedTreesModel
GBDT_model = GradientBoostedTreesModel.load(
    sc,
    "hdfs://node1:9000/user/root/exp4/models/myGradientBoostingClassificationModel"
)

# %%
predictions = GBDT_model.predict(test.map(lambda x: x[3]))
GBDT_predictions = test.map(lambda lp: (lp[0], lp[1])).zip(predictions).map(
    lambda lp: (lp[0][0], lp[0][1], lp[1]))
GBDT_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/GBDT_predictions.csv")

# GBDT_predictions = test.map(lambda line: (line[0],line[1],float(GBDT_model.predict(line[3]))))
# GBDT_predictions.coalesce(1).toDF().write.options(header="true").csv("hdfs://node1:9000/user/root/exp4/predictions/GBDT_predictions.csv")
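# (The map-side predict above is commented out for a reason: pyspark.mllib's
# tree-ensemble models are JVM-backed, and per the MLlib docs predict() cannot
# be called inside an RDD transformation; hence the zip-based pattern above.)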

# %% [markdown]
# Date: 2020-12-20 14:51:00  Rank: none
# score: 0.5000562
Example #6
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Example #7
# ONE-HOT ENCODING OF CATEGORICAL TEXT FEATURES FOR INPUT INTO TREE-BASED MODELS
def parseRowIndexingRegression(line):
    features = np.array([line.paymentIndex, line.vendorIndex, line.rateIndex, line.TrafficTimeBinsIndex, 
                         line.pickup_hour, line.weekday, line.passenger_count, line.trip_time_in_secs, 
                         line.trip_distance, line.fare_amount])
    return  features

# FOR REGRESSION CLASSIFICATION TRAINING AND TESTING
indexedTESTreg = encodedFinal.map(parseRowIndexingRegression)

# CACHE RDDS IN MEMORY
indexedTESTreg.cache()

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
####################################################################
## REGRESSION: LOAD SAVED MODEL, SCORE AND SAVE RESULTS BACK TO BLOB
savedModel = GradientBoostedTreesModel.load(sc, BoostedTreeRegressionFileLoc)
predictions = savedModel.predict(indexedTESTreg)

# SAVE RESULTS
datestamp = str(datetime.datetime.now()).replace(' ', '').replace(':', '_')
btregressionfilename = "GradientBoostingTreeRegression_" + datestamp + ".txt"
dirfilename = scoredResultDir + btregressionfilename
predictions.saveAsTextFile(dirfilename)

# ## Cleanup objects from memory, print final time, and print scored output file locations

# #### Unpersist objects cached in memory
taxi_df_test_cleaned.unpersist()
indexedTESTreg.unpersist()
Example #8
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesClassificationExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={}, numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingClassificationModel")
    sameModel = GradientBoostedTreesModel.load(sc,
                                               "target/tmp/myGradientBoostingClassificationModel")
    # $example off$
Example #9
    encodedPitchingFeatures = sqlContext.parquetFile(rddDir + "/pitching_features.enc.parquet")
    encodedPitchingFeatures.registerTempTable("pfe")
    print "pfe=", encodedPitchingFeatures.take(22)

    pfDF = sqlContext.sql("""select distinct pfe.* from fd_pitchers, pfe where
                            fd_pitchers.player_id = pfe.player_id
                            and fd_pitchers.game_date = pfe.game_date""")
    #TODO - why are we getting duplicate records?
    print "count pfdf=", pfDF.count()
    print "pfDF=", pfDF.collect()
    print "pfDF vals=", pfDF.select('player_id', 'game_id').collect()


    #model = RandomForestModel.load(sc, rddDir + "batting_model.RandomForest")
    #model = GradientBoostedTreesModel.load(sc, rddDir + "batting_model.RandomForest")
    model = GradientBoostedTreesModel.load(sc, rddDir + "batting_fd_points_model.RandomForest")
    playerAndPredictions = TrainModel.predictHitters(hfDF, decodedHitterPlayerIds, rddDir, sc)
    hitterFeatures = hfDF.collect()
#    global predictField
#    predictField = 'fd_points'
#    data = hfDF.map(toLabeledPoint)
#    predictions = model.predict(data.map(lambda x: x.features))
#    print "predictions=", predictions
#    print "predictions take=", predictions.take(2)
#    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions).cache()
#    print "predictions=", labelsAndPredictions.take(2)
#    playerAndPredictions = hfDF.map(lambda x: x.asDict()['player_id']).map(lambda x: decodePlayerIds[x]).zip(predictions).cache()
#    print "playerAndPredictions=", playerAndPredictions.take(2)
    
    #model = RandomForestModel.load(sc, rddDir + "pitching_model.RandomForest")
    #model = GradientBoostedTreesModel.load(sc, rddDir + "pitching_model.RandomForest")
Example #10
conf = SparkConf().setAppName(
    'Gradient Boosted Tree Classification').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data file
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

# split the data into training and test
trainingData, test = data.randomSplit([0.7, 0.3])

# train a gradient boost tree model
model = GradientBoostedTrees.trainClassifier(trainingData,
                                             categoricalFeaturesInfo={},
                                             numIterations=3)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    test.count())
print('test error: ' + str(testErr))
print('learned classification GBT model:')
print(model.toDebugString())

# save and load
model.save(sc, '../model/myGradientBoostingClassificationModel')
sameModel = GradientBoostedTreesModel.load(
    sc, '../model/myGradientBoostingClassificationModel')

sc.stop()
Example #11
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTreesModel
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext, SparkConf
from pyspark.ml.util import MLReader
from random import random

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

#loader = MLReader()
#model = loader.load('./model')
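# (MLReader above is commented out: it belongs to the pyspark.ml API and
# cannot read pyspark.mllib tree models, hence the mllib loader below.)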
model = GradientBoostedTreesModel.load(sc, './gbdymodelonlionev1')

rdd = sc.parallelize([[i for i in range(16)], [i * 2000 for i in range(16)],
                      [random() for i in range(16)]])
print(model.predict(rdd).collect())

#model.predict(sparkpdfrompandas.rdd.map(lambda x: list(x)))
print(model)

#print(model.predict(SparseVector(2, {0: 1.0})))
Example #12
        ascontext.setSparkOutputSchema(output_schema)
        sys.exit(0)

    modelpath = ascontext.getModelContentToPath("model")
    model_metadata = json.loads(ascontext.getModelContentToString("model.metadata"))
    model_type = model_metadata["model_type"]

# create a DataModelTools to handle data model and data conversions
datamodel = model_metadata["datamodel"]
dmt = DataModelTools(datamodel)

predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel, predictors, df)

from pyspark.mllib.tree import GradientBoostedTreesModel
model = GradientBoostedTreesModel.load(sc, modelpath)

# to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this
dv = dmt.extractDenseVector(df, predictors).map(lambda x: x[1])

# scoring generates an RDD of predictions (but not the original features)
predictions = model.predict(dv)

# now we need to zip together the original rows from the DataFrame and the RDD of predictions
# we end up with an RDD containing the list of values from the original dataframe plus the predicted class, converted from the encoded number to the original string
def rowToList(row):
    result = []
    for idx in range(0, len(row)):
        result.append(row[idx])
    return result
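# A hypothetical sketch of the zip step described in the comment above (the
# original snippet is cut off before it): pair each original row with its
# prediction and append the raw predicted value; mapping the encoded number
# back to the original string would go through DataModelTools, whose API is
# not shown here.
scoredRDD = df.rdd.map(rowToList).zip(predictions).map(
    lambda pair: pair[0] + [pair[1]])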
Example #13
    #df = spark.read.option("header","false").csv("hdfs://student61:9000/wiki/final_Item.csv")
    df = spark.read.option("header", "false").option(
        "inferSchema", "true").csv("hdfs://student61:9000/wiki/final_Item.csv")

    #df.cast(DoubleType)
    print('parker!!!', df)
    print(df.first())
    df.show()
    test_data = df.rdd.map(
        lambda row: LabeledPoint(row[-1], Vectors.dense(row[:])))
    print('testData!!!', test_data)
    # load model
    start_time = time()
    #model = GradientBoostedTreesModel.load(spark.sparkContext,"hdfs://student61:9000/wiki/GBDT_model")
    # model = RandomForestModel.load(spark.sparkContext,"hdfs://student61:9000/wiki/RF_model")
    model = GradientBoostedTreesModel.load(
        spark.sparkContext, "hdfs://student61:9000/wiki/GBDT_regression_model")
    #print(model.toDebugString())
    end_time = time()
    elapsed_time = end_time - start_time
    print("---------------------------------------------------")
    print("Time to load model: %.3f seconds" % elapsed_time)
    print("---------------------------------------------------")
    # make predictions
    predictions = model.predict(test_data.map(lambda x: x.features))
    end_time = time()
    elapsed_time = end_time - start_time
    print("---------------------------------------------------")
    print("Time from load model to predictions: %.3f seconds" % elapsed_time)
    print("---------------------------------------------------")
    print(
        '--------------------------------------------------------------------')
Example #14
# coding=utf-8

from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Gradient Boosted Tree Regression').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data file
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

# split the data into training and test
trainingData, test = data.randomSplit([0.7, 0.3])

# train a gradient boosted tree model
model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, numIterations=3)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(test.count())
print('test mean squared error: ' + str(testMSE))
print('learned regression GBT model:')
print(model.toDebugString())

# save and load
model.save(sc, '../model/myGradientBoostingRegressionModel')
sameModel = GradientBoostedTreesModel.load(sc, '../model/myGradientBoostingRegressionModel')

sc.stop()
print(str(datetime.now())+": Initiating SparkContext...")

# Initiate Spark Context
try:
    sc = SparkContext("local","dengue")
    sqlContext = SQLContext(sc)
except:
    print(str(datetime.now())+": Failed to initiate Spark Context!")
    print(str(datetime.now())+": Quitting...")
    sys.exit()

print(str(datetime.now())+": Loading trained ML model from HDFS...")

# Load trained model from HDFS
try:
    ml_model = GradientBoostedTreesModel.load(sc,"hdfs:///user/w205/dengue_prediction/ml_model")
except:
    print(str(datetime.now())+": Unable to load trained model from HDFS!")
    print(str(datetime.now())+": Quitting...")
    sys.exit()

print(str(datetime.now())+": Testing database connection...")

try:
    # Connect to the database
    conn = psycopg2.connect(database="denguepred", user="******", password="******", host="localhost", port="5432")
    # Create cursor
    cur = conn.cursor()
    # Execute a query just to check that we don't get an exception
    cur.execute("SELECT * from predictions LIMIT 1;")
    # Try to fetch the result
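    # The example breaks off here; a plausible completion (an assumption, not
    # the original code) fetches the row and mirrors the error handling used
    # by the earlier try/except blocks:
    row = cur.fetchone()
    print(str(datetime.now())+": Database connection OK, sample row: "+str(row))
except:
    print(str(datetime.now())+": Database connection test failed!")
    print(str(datetime.now())+": Quitting...")
    sys.exit()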
Example #16
        line.passenger_count, line.trip_time_in_secs, line.trip_distance,
        line.fare_amount
    ])
    return features


# FOR REGRESSION CLASSIFICATION TRAINING AND TESTING
indexedTESTreg = encodedFinal.map(parseRowIndexingRegression)

# CACHE RDDS IN MEMORY
indexedTESTreg.cache()

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
####################################################################
## REGRESSION: LOAD SAVED MODEL, SCORE AND SAVE RESULTS BACK TO BLOB
savedModel = GradientBoostedTreesModel.load(sc, BoostedTreeRegressionFileLoc)
predictions = savedModel.predict(indexedTESTreg)

# SAVE RESULTS
datestamp = str(datetime.datetime.now()).replace(' ', '').replace(':', '_')
btregressionfilename = "GradientBoostingTreeRegression_" + datestamp + ".txt"
dirfilename = scoredResultDir + btregressionfilename
predictions.saveAsTextFile(dirfilename)

# ## Cleanup objects from memory, print final time, and print scored output file locations

# #### Unpersist objects cached in memory
taxi_df_test_cleaned.unpersist()
indexedTESTreg.unpersist()
Example #17
def load(name):
    model = init({}, {})
    model['model'] = GradientBoostedTreesModel.load(
        model['spark'].sparkContext, MODEL_DIRECTORY + name)
    return model
Example #18
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={}, numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(sc, "target/tmp/myGradientBoostingRegressionModel")
    # $example off$
Example #19
    def load(self, path):
        return GradientBoostedTreesModel.load(sc, path)
Example #20
if __name__ == "__main__":
    sc = SparkContext(
        appName="PythonGradientBoostedTreesClassificationExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingClassificationModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingClassificationModel")
    # $example off$
Example #21
if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingRegressionModel")
    # $example off$
Example #22
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(),
                         dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(),
                         rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(),
                         gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Example #23
    print(dataPath)

    data = MLUtils.loadLibSVMFile(sc, dataPath)
    # split the dataset into training and test sets
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    print("train data count: " + str(trainingData.count()))
    print("test data count : " + str(testData.count()))

    # train a GBDT classifier
    # an empty categoricalFeaturesInfo means all features are continuous
    # use more numIterations in practice
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=3)

    # predict on the test set
    predictions = model.predict(testData.map(lambda x: x.features))
    # zip the true labels with the predictions
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # compute the fraction of misclassified samples
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('GradientBoosted Trees Test Error = %5.3f%%' % (testErr * 100))
    print("GradientBoosted Trees Learned classifiction tree model : ")
    print(model.toDebugString())

    # save and reload the trained model
    modelPath = "/home/zhb/Desktop/work/DecisionTreeShareProject/app/myGradientBoostingClassificationModel"
    model.save(sc, modelPath)
    sameModel = GradientBoostedTreesModel.load(sc, modelPath)