def run_GBDT(input_file, output_file, iterations):
    dataRDD = sc.textFile(input_file).map(lambda x: x.replace('\t', ','))
    # Create labeled points from the raw data
    dataRDDParsed = dataRDD.map(parsePoint).cache()
    # Keep the 26 most frequent feature values for one-hot encoding
    featSet = dataRDDParsed.flatMap(lambda x: x).map(maaro) \
        .reduceByKey(lambda a, b: a + b) \
        .takeOrdered(26, lambda (k, v): -v)
    OHEdict = {}
    for i, x in enumerate(featSet):
        OHEdict[x[0]] = i
    # Build one-hot-encoded training points (39 raw features)
    OHETrainData = dataRDD.map(lambda point: parseOHEPoint(point, OHEdict, 39))
    model = GradientBoostedTrees.trainClassifier(OHETrainData, loss='logLoss',
                                                 numIterations=iterations,
                                                 categoricalFeaturesInfo={},
                                                 learningRate=0.1, maxDepth=7,
                                                 maxBins=2)
    sc.parallelize([model.toDebugString()]).coalesce(1).saveAsTextFile(output_file)
def train(self):
    neg_df = spark.read.format(
        'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'
    ).option('header', 'true').load('neg.csv')
    pos_df = spark.read.format(
        'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'
    ).option('header', 'true').load('pos.csv')
    test_pos_df = spark.read.format(
        'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'
    ).option('header', 'true').load('ptest.csv')
    test_neg_df = spark.read.format(
        'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'
    ).option('header', 'true').load('ntest.csv')
    training_df = neg_df.union(pos_df)
    test_df = test_neg_df.union(test_pos_df)
    labelpointRdd = training_df.rdd.map(featureExtraction).map(
        lambda x: LabeledPoint(x[0], x[1:])).cache()
    TestlabelpointRdd = test_df.rdd.map(featureExtraction).map(
        lambda x: LabeledPoint(x[0], x[1:])).cache()
    GBTmodel = GradientBoostedTrees.trainClassifier(
        labelpointRdd, categoricalFeaturesInfo={}, numIterations=75)
    predictions = GBTmodel.predict(
        TestlabelpointRdd.map(lambda x: x.features))
    labelsAndPredictions = TestlabelpointRdd.map(lambda lp: lp.label).zip(
        predictions)
    # Save the model, then score it on the held-out set
    GBTmodel.save(sc, '.')
    return score(labelsAndPredictions)
def train_model(cls, trainData, cateFeaInfo={}, iterTimes=3):
    """Train the model."""
    model = GradientBoostedTrees.trainClassifier(
        trainData, categoricalFeaturesInfo=cateFeaInfo,
        numIterations=iterTimes)
    return model
def main(sc, sql_context, is_hive=True):
    lp_train = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.train")
    lp_check = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.check")

    model = GradientBoostedTrees.trainRegressor(lp_train, {},
                                                numIterations=50, maxDepth=10)

    preds = model.predict(lp_check.map(lambda x: x.features))
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)
    for each in labels_and_preds.take(100):
        print each

    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print each

    # MSE of the model's predictions, then MSE of a constant-1.0 baseline
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print mse
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print mse
def trainevaluatemodel_gbdt(traindata, validationdata, loss, numiterations,
                            learningrate, maxdepth, maxbins):
    starttime = time()
    model = GradientBoostedTrees.trainClassifier(traindata,
                                                 categoricalFeaturesInfo={},
                                                 loss=loss,
                                                 numIterations=numiterations,
                                                 learningRate=learningrate,
                                                 maxDepth=maxdepth,
                                                 maxBins=maxbins)
    index = evaluation(model, validationdata)
    duration = time() - starttime
    print('Param:' + '\n' +
          'loss:' + str(loss) + '\n' +
          'numiterations:' + str(numiterations) + '\n' +
          'learningrate:' + str(learningrate) + '\n' +
          'maxdepth:' + str(maxdepth) + '\n' +
          'maxbins:' + str(maxbins) + '\n' +
          'time:' + str(duration) + '\n' +
          'index:' + str(index))
    return (loss, numiterations, learningrate, maxdepth, maxbins, duration,
            index)
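# A minimal grid-search sketch built on trainevaluatemodel_gbdt above. The
# fold RDDs traindata/validationdata and the evaluation() helper are assumed
# to exist as in that snippet, and we assume a larger evaluation index is
# better (e.g. an AUC-style metric); flip reverse= if it is an error rate.
results = []
for numiterations in [10, 30]:
    for learningrate in [0.1, 0.3]:
        for maxdepth in [3, 5]:
            results.append(trainevaluatemodel_gbdt(
                traindata, validationdata, 'logLoss',
                numiterations, learningrate, maxdepth, 32))

# Each result tuple ends with the evaluation index; keep the best one.
best = sorted(results, key=lambda r: r[-1], reverse=True)[0]
print('Best params: ' + str(best))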
def ctr_gbdt(file_dir):
    sc = SparkContext(appName="CTRGBDTRegression")
    path = file_dir + CTR_TRAINING_DATA + "/part*"
    data = sc.textFile(path)

    (training_data, testData) = data.randomSplit([0.7, 0.3])
    parsed_train_data = training_data.map(_parse_point)
    parsed_test_data = testData.map(_parse_point)

    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(parsed_train_data,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=100)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(parsed_test_data.map(lambda x: x.features))
    labels_and_predictions = parsed_test_data.map(lambda lp: lp.label).zip(
        predictions)
    test_err = labels_and_predictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(parsed_test_data.count())

    logger = logging.getLogger()
    logger.debug('GBDT Training Error = ' + str(test_err))
    logger.debug('Learned classification GBT model:')
    logger.debug(model.toDebugString())
    logger.debug("Tree totalNumNodes" + str(model.totalNumNodes()))

    # Save and load model
    ctr_gbdt_data = file_dir + CTR_GBDT_DATA
    model.save(sc, ctr_gbdt_data)
    logger.info("GBDT training finished")
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd, iterations=10)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)

    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]),
                                      iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]),
                           iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]),
                                     iterations=10)
    except ValueError:
        self.fail()
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100,
        seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)

    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
    except ValueError:
        self.fail()
def Regression_Model(filename):
    open_price, close_price, open_price_train, close_price_train, True_price, True_price_train, Date = get_csv_data(
        filename)
    output = []
    for i in range(1, len(Date)):
        tmp = LabeledPoint(label=True_price_train[i],
                           features=[close_price_train[i]])
        output.append(tmp)
    output_train_RDD = sc.parallelize(output).cache()

    lrm = LinearRegressionWithSGD.train(output_train_RDD, step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance', maxDepth=5,
                                       maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance', maxDepth=5,
                                         maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)

    print("\n============MODEL Evaluation=============\n")
    model_name = ['LinearRegression', 'DecisionTree', 'RandomForest',
                  'GradientBoostedTrees']
    es_modelname = ['lrm', 'tree', 'forest', 'gradient']
    result = ''
    x = 0
    err = 1000
    # Swap in a different model here to change the default
    output_model_RDD = lrm
    for model in [lrm, tree, forest, gradient]:
        predictions = model.predict(output_train_RDD.map(lambda x: x.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(
            predictions)
        # Root mean squared error on the training set
        RMSE = (labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /
                float(output_train_RDD.count())) ** 0.5
        result += model_name[x] + "\tRoot Mean Squared Error\t=" + str(RMSE) + "\n"
        if err > RMSE:
            err = RMSE
            output_model_RDD = model
            es_model = es_modelname[x]
        x += 1
    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
def main():
    text = sc.textFile(inputs)
    nltk_data_path = "[change to your own nltk_data location]"  # e.g. the sfu server path
    nltk.data.path.append(nltk_data_path)
    cleaned_review = text.map(clean_reviewf).cache()
    reviews_txt = cleaned_review.map(lambda review: review['reviewText'])
    reviews = cleaned_review.map(lambda review: (review['overall'], review['reviewText'], review['reviewTime'])).cache()

    # Train on pre-2014 reviews, test on 2014 reviews
    training_reviews = reviews.filter(lambda (rating, review_text, review_date): review_date.tm_year < 2014)
    testing_reviews = reviews.filter(lambda (rating, review_text, review_date): review_date.tm_year == 2014)
    training_data = training_reviews.map(lambda (rating, review_text, review_date): (rating, review_text)).zipWithIndex().cache()
    testing_data = testing_reviews.map(lambda (rating, review_text, review_date): (rating, review_text)).zipWithIndex().cache()

    training_rating = training_data.map(lambda ((rating, review_text), review_index): (review_index, rating))
    training_review_text = training_data.map(lambda ((rating, review_text), review_index): (review_index, review_text))
    training_review_text_flat = training_review_text.flatMapValues(myf)
    training_review_text_flat = training_review_text_flat.map(lambda (review_index, review_word): (review_word, review_index))

    testing_rating = testing_data.map(lambda ((rating, review_text), review_index): (review_index, rating))
    testing_review_text = testing_data.map(lambda ((rating, review_text), review_index): (review_index, review_text))
    testing_review_text_flat = testing_review_text.flatMapValues(myf)
    testing_review_text_flat = testing_review_text_flat.map(lambda (review_index, review_word): (review_word, review_index))

    word2vec_model = generate_word2vec_model(reviews_txt)
    mv = word2vec_model.getVectors()
    # This step seems redundant but is necessary: copy the JVM map into a
    # plain Python list of (word, vector) pairs
    mvdct = []
    for k, v in mv.items():
        vec = [f for f in v]
        mvdct.append((k, vec))
    dct_rdd = sc.parallelize(mvdct)

    # Average the word vectors of each review into one feature vector per review
    training_feature_vecs = dct_rdd.join(training_review_text_flat)
    training_vecs = training_feature_vecs.map(lambda (w, (feature_vec, review_index)): (review_index, (feature_vec, 1)))
    training_reduce_vecs = training_vecs.reduceByKey(lambda v1, v2: (np.sum([v1[0], v2[0]], axis=0), v1[1] + v2[1]))
    training_avg_vecs = training_reduce_vecs.map(lambda (review_index, (feature_vec, ct)): (review_index, np.array(feature_vec) / float(ct)))
    training_rating_avgf = training_rating.join(training_avg_vecs)
    training_lps = training_rating_avgf.map(get_lp)

    testing_feature_vecs = dct_rdd.join(testing_review_text_flat)
    testing_vecs = testing_feature_vecs.map(lambda (w, (feature_vec, review_index)): (review_index, (feature_vec, 1)))
    testing_reduce_vecs = testing_vecs.reduceByKey(lambda v1, v2: (np.sum([v1[0], v2[0]], axis=0), v1[1] + v2[1]))
    testing_avg_vecs = testing_reduce_vecs.map(lambda (review_index, (feature_vec, ct)): (review_index, np.array(feature_vec) / float(ct)))
    testing_rating_avgf = testing_rating.join(testing_avg_vecs)
    testing_lps = testing_rating_avgf.map(get_lp)

    gbt_model = GradientBoostedTrees.trainClassifier(training_lps,
                                                     categoricalFeaturesInfo={},
                                                     numIterations=20)
    predictions = gbt_model.predict(testing_lps.map(lambda x: x.features))
    labelsAndPredictions = testing_lps.map(lambda lp: lp.label).zip(predictions)
    MSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testing_lps.count())
    RMSE = math.sqrt(MSE)
    result = str(RMSE)
    outdata = sc.parallelize([result])
    outdata.saveAsTextFile(output)
def cross_validation_gb(Data_1, Data_2, Data_3, loss_type, num_iter, maxDepth):
    # Fold 1: train on folds 1+2, test on fold 3 (then rotate)
    model_train_1 = GradientBoostedTrees.trainRegressor(
        Data_1.union(Data_2), categoricalFeaturesInfo={}, loss=loss_type,
        numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    testMSE_1 = labelsAndPredictions_1.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(Data_3.count())

    model_train_2 = GradientBoostedTrees.trainRegressor(
        Data_2.union(Data_3), categoricalFeaturesInfo={}, loss=loss_type,
        numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(Data_1.count())

    model_train_3 = GradientBoostedTrees.trainRegressor(
        Data_3.union(Data_1), categoricalFeaturesInfo={}, loss=loss_type,
        numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(Data_2.count())

    # Average MSE across the three folds
    return (testMSE_1 + testMSE_2 + testMSE_3) / 3
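# A usage sketch for cross_validation_gb above. The LabeledPoint RDD
# labeled_data is assumed; randomSplit produces the three roughly equal
# folds that the function unions pairwise for training.
fold_1, fold_2, fold_3 = labeled_data.randomSplit([1.0, 1.0, 1.0], seed=42)
avg_mse = cross_validation_gb(fold_1, fold_2, fold_3,
                              loss_type='leastSquaresError',
                              num_iter=20, maxDepth=4)
print('Average cross-validation MSE = ' + str(avg_mse))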
def testRegression(trainingData, testData, model_path):
    # Train a GradientBoostedTrees model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=3, maxDepth=4)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() \
        / float(testData.count())
    print("Test Mean Squared Error = " + str(testMSE))
    print("Learned regression GBT model:")
    print(model.toDebugString())
    model.save(sc, model_path)
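# Since testRegression persists the model, a natural follow-up is reloading
# it for scoring. A sketch, assuming the same sc, testData and model_path;
# GradientBoostedTreesModel.load is the standard MLlib counterpart to save.
from pyspark.mllib.tree import GradientBoostedTreesModel

same_model = GradientBoostedTreesModel.load(sc, model_path)
new_predictions = same_model.predict(testData.map(lambda x: x.features))
print(new_predictions.take(5))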
def testClassification(trainingData, testData):
    # Train a GradientBoostedTrees model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=30, maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda v_p: v_p[0] != v_p[1]).count() / float(testData.count())
    print("Test Error = " + str(testErr))
    print("Learned classification ensemble model:")
    print(model.toDebugString())
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def GBDT_train(data, filename):
    data_train = split_data(data, [0.5, 0.5])
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_GBDT = GradientBoostedTrees.trainClassifier(training, {},
                                                      numIterations=20)
    # Note: PySpark MLlib models cannot call predict() inside an RDD
    # transformation, so predict on the features RDD and zip with the labels.
    predictions = model_GBDT.predict(test.map(lambda x: x.features))
    predictionAndlabel = predictions.map(float).zip(test.map(lambda x: x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print("accuracy of model_GBDT:%f" % accuracy)
    pre_all(data, model_GBDT, filename)
    return model_GBDT, accuracy
def testRegression(trainingData, testData):
    # Train a GradientBoostedTrees model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=30, maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() \
        / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression ensemble model:')
    print(model.toDebugString())
def main():
    # Read train and test data
    trainData = sc.pickleFile(input + '/Train_data.average/part-00000')
    testData = sc.pickleFile(input + '/Test_data.average/part-00000')
    # Drop rows that parsed to an empty feature vector
    parsedData = trainData.map(parseInput).filter(
        lambda line: len(line.features) != 0)
    parsedTestData = testData.map(parseInput).filter(
        lambda line: len(line.features) != 0).cache()
    model = GradientBoostedTrees.trainRegressor(parsedData,
                                                categoricalFeaturesInfo={},
                                                numIterations=1)
    predictions = model.predict(parsedTestData.map(lambda x: x.features))
    labelsAndPredictions = parsedTestData.map(lambda lp: lp.label).zip(predictions)
    validationErr = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(parsedTestData.count())
    parsedTestData.unpersist()
    RMSE = math.sqrt(validationErr)
    print("Root Mean Squared Error Test = " + str(RMSE))
def validation_gb(trainingData, testData, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(
        trainingData, categoricalFeaturesInfo={}, loss=loss_type,
        numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(testData.count())
    return testMSE
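# validation_gb returns a single held-out MSE, so it drops naturally into a
# depth sweep. A sketch, assuming trainingData and testData are LabeledPoint
# RDDs; the depth grid is illustrative.
for depth in [3, 5, 7]:
    mse = validation_gb(trainingData, testData,
                        loss_type='leastSquaresError',
                        num_iter=20, maxDepth=depth)
    print('maxDepth=%d -> test MSE=%f' % (depth, mse))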
def testClassification(trainingData, testData):
    # Train a GradientBoostedTrees model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=30, maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda v_p: v_p[0] != v_p[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification ensemble model:')
    print(model.toDebugString())
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train = set(
        ['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [
        v for i, v in enumerate(train_data.columns) if v not in removelist_train
    ]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train,
                                      outputCol="features")
    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select("features", "stars")).map(
        lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    # (collect + parallelize forces the training data into 5 partitions)
    model_train = GradientBoostedTrees.trainRegressor(
        sc.parallelize(data_train.collect(), 5), categoricalFeaturesInfo={},
        loss=loss_type, numIterations=num_iter, maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(
        ['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [
        v for i, v in enumerate(test_data.columns) if v not in removelist_final
    ]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,
                                      outputCol="features")
    transformed_final = assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(
        lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
def Gradient_BoostedTrees(filename, sc):
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(
        sc, "/Users/Jacob/SparkService/data/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())
def fit(model, df, param):
    spark = model['spark']
    sc = spark.sparkContext
    feature_variables = param['feature_variables']
    target_variable = param['target_variables'][0]

    # Default to 10 boosting iterations unless overridden in the options
    iterations = 10
    if 'options' in param:
        if 'params' in param['options']:
            if 'iterations' in param['options']['params']:
                iterations = int(param['options']['params']['iterations'])

    sdf = spark.createDataFrame(df)
    rdd = sdf.rdd.map(lambda row: LabeledPoint(
        row[target_variable], [row[x] for x in feature_variables]))
    model['model'] = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo={}, numIterations=iterations)

    info = {"message": "model trained"}
    return info
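# A minimal sketch of the inputs fit() above expects. The column names are
# illustrative and must match the DataFrame df being passed in; the model
# dict only needs a 'spark' entry holding the SparkSession.
model = {'spark': spark}              # fit() pulls the SparkSession from this dict
param = {
    'feature_variables': ['f1', 'f2', 'f3'],
    'target_variables': ['label'],
    'options': {'params': {'iterations': '25'}},  # fit() applies int() to this
}
info = fit(model, df, param)          # trains the GBT and stores it in model['model']
print(info['message'])                # -> "model trained"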
def crossValidator(IterNums, dataset_rdd, rate):
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negative = dataset_rdd.filter(lambda e: e[1] < 0.5)
    # Five roughly equal folds per class for 5-fold cross-validation
    dataset_positive_list = dataset_positive.randomSplit([1, 1, 1, 1, 1])
    dataset_negative_list = dataset_negative.randomSplit([1, 1, 1, 1, 1])
    result = []
    for i in range(5):
        testset_positive = dataset_positive_list[i].count()
        testset_rdd = dataset_positive_list[i].union(dataset_negative_list[i])
        testset_count = testset_rdd.count()
        trainset_rdd = dataset_rdd.subtract(testset_rdd)
        trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
        testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
        model = GradientBoostedTrees.trainClassifier(trainset, {},
                                                     numIterations=IterNums,
                                                     learningRate=rate)
        predictions = model.predict(testset.map(lambda x: x.features))
        predict = testset.map(lambda lp: lp.label).zip(predictions)
        hitALL = predict.filter(lambda e: e[0] == e[1]).count()
        hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
        positive = predict.filter(lambda e: e[1] > 0.5).count()
        recall = hitPositive / float(testset_positive)
        precision = hitPositive / float(positive)
        accuracy = hitALL / float(testset_count)
        F_Value = 2 / (1 / precision + 1 / recall)
        result.append((precision, recall, accuracy, F_Value, hitPositive,
                       positive, testset_positive, testset_count))
    return result
def precreate_models(train_data):
    models = list()
    for depth in range(9, 10):
        for num_trees in range(4, 10, 3):
            for impurity in ['entropy']:  # ['gini', 'entropy']
                for feature in ['onethird']:  # ['auto', 'all', 'sqrt', 'log2', 'onethird']
                    models.append(
                        RandomForest.trainClassifier(
                            train_data,
                            numClasses=10,
                            categoricalFeaturesInfo={},
                            numTrees=num_trees,
                            featureSubsetStrategy=feature,
                            impurity=impurity,
                            maxDepth=depth,
                            maxBins=32))
    for iters in range(9, 10):
        for rate in np.linspace(0.1, 1, 2):
            for depth in range(9, 10):
                for loss in ['leastSquaresError']:  # ['logLoss', 'leastSquaresError', 'leastAbsoluteError']
                    models.append(
                        GradientBoostedTrees.trainClassifier(
                            train_data,
                            categoricalFeaturesInfo={},
                            loss=loss,
                            numIterations=iters,
                            learningRate=rate,
                            maxDepth=depth))
    return models
def main():
    records = get_records()
    first = records.first()
    records.cache()

    # Extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    first_point = data.first()

    gbt_model = GradientBoostedTrees.trainRegressor(data,
                                                    categoricalFeaturesInfo={},
                                                    numIterations=3)
    predictions = gbt_model.predict(data.map(lambda x: x.features))
    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    print "GradientBoosted Trees predictions: " + str(labelsAndPredictions.take(5))

    mse = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(data.count())
    mae = labelsAndPredictions.map(lambda (v, p): np.abs(v - p)).sum() /\
        float(data.count())
    rmsle = np.sqrt(labelsAndPredictions.map(lambda (v, p): ((np.log(p + 1) - np.log(v + 1)) ** 2)).sum() /
                    float(data.count()))
    print('Gradient Boosted Trees - Mean Squared Error = ' + str(mse))
    print('Gradient Boosted Trees - Mean Absolute Error = ' + str(mae))
    print('Gradient Boosted Trees - Root Mean Squared Log Error = ' + str(rmsle))
# We have to do something here to cache the dataset, otherwise it hangs later on due to a PySpark bug
num_records = training_data.count()
print(" * Transformed data read!")

print(" * Training test ML model... ")
# Label the data points
labeled_data = training_data.map(lambda x: LabeledPoint(x[-1], x[:-1]))

# Separate training and testing data
train_data, test_data = labeled_data.randomSplit([0.8, 0.2])

# Do something again to avoid the PySpark bug hang from manifesting
num_train_recs = train_data.count()
num_test_recs = test_data.count()

# Train the model
ml_model = GradientBoostedTrees.trainRegressor(train_data, {},
                                               numIterations=20,
                                               loss='leastAbsoluteError')
print(" * Model trained!")

print(" * Testing model error... ")
# Predict and calculate error metrics
predictions = ml_model.predict(test_data.map(lambda r: r.features))
predictions = predictions.zip(test_data.map(lambda r: r.label))
metrics = RegressionMetrics(predictions)
print(" * Model regression error metrics: ")
print("   - Mean Absolute Error: %.2f" % metrics.meanAbsoluteError)
print("   - Mean Squared Error: %.2f" % metrics.meanSquaredError)
print("   - Root Mean Squared Error: %.2f" % metrics.rootMeanSquaredError)
trainingData = trainingData.cache()
testData = testData.cache()

from time import time

errors = {}
cfi = {}
depth = 9
lr = 0.3
lossfunc = "logLoss"
#stopER = 0.27498
stopER = 0.2745
testER = 1.0
attemption = 0
# Retrain until the test error drops below the target threshold
while testER >= stopER:
    start = time()
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo=cfi,
                                                 loss=lossfunc,
                                                 maxDepth=depth,
                                                 numIterations=10,
                                                 learningRate=lr)
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in ['test', 'train']:
        # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count())
        errors[depth][name] = Err
        if name == 'test':
            if Err >= stopER:
                attemption += 1
                break
            else:
                testER = Err
    print depth, errors[depth]
from pyspark.context import SparkContext
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1], values[1:10])

data = sc.textFile("heart_disease.csv")
data = data.map(parsePoint)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = GradientBoostedTrees.trainClassifier(trainingData,
                                             categoricalFeaturesInfo={},
                                             numIterations=30, maxDepth=4)

# This works too!
train = sc.textFile("train.csv")

def parsePoint(line):
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:65])

train = train.map(parsePoint)
model = GradientBoostedTrees.trainClassifier(train,
                                             categoricalFeaturesInfo={},
                                             numIterations=300, maxDepth=2,
                                             learningRate=0.1)

test = sc.textFile("test.csv")
test = test.map(parsePoint)
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())
path = '/covtype/covtype.data'
inputRDD = sc.textFile(path)

# Binary task: label is 1.0 when the cover type equals 2.0
Label = 2.0
Data = inputRDD.map(lambda line: [float(x) for x in line.split(',')]) \
    .map(lambda V: LabeledPoint((V[-1] == Label), V[:-1])).cache()
(trainingData, testData) = Data.randomSplit([0.7, 0.3], seed=255)

from time import time

errors = {}
# Features 10-53 are binary indicator columns
catInfo = {}
for i in range(10, 54):
    catInfo[i] = 2
for depth in [10]:
    start = time()
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 learningRate=0.2,
                                                 numIterations=30,
                                                 maxDepth=depth,
                                                 categoricalFeaturesInfo=catInfo)
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():
        # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda x: x.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count())
        errors[depth][name] = Err
print errors
Data = inputRDD.map(lambda line: [float(x) for x in line.split(',')]) \
    .map(lambda V: LabeledPoint(1.0, V[:-1]) if V[-1] == 2.0
         else LabeledPoint(0.0, V[:-1])).cache()

# ### Reducing data size

# In[11]:

(trainingData, testData) = Data.randomSplit([0.7, 0.3], seed=255)
trainingData.cache()
testData.cache()

# ### Gradient Boosted Trees

# In[13]:

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

errors = {}
for depth in [14]:
    model = GradientBoostedTrees.trainClassifier(trainingData, {},
                                                 numIterations=15,
                                                 maxDepth=depth)
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():
        # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count())
        errors[depth][name] = Err
    print depth, errors[depth]
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

# API docs
# https://spark.apache.org/docs/1.4.0/api/python/pyspark.mllib.html

# Function to convert .csv files into 'LabeledPoint' format
def parsePoint(line):
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1], values[:65])

# Load .csv data
train_csv = sc.textFile("train.csv")
test_csv = sc.textFile("test.csv")

# Convert the data to LabeledPoint format
train_parsed = train_csv.map(parsePoint)
test_parsed = test_csv.map(parsePoint)

# Build a GBM / TreeNet model
model = GradientBoostedTrees.trainClassifier(
    train_parsed, loss='leastSquaresError', categoricalFeaturesInfo={},
    numIterations=300, maxDepth=2, learningRate=0.1)

# Get predictions and see how it did
predictions = model.predict(test_parsed.map(lambda x: x.features))
labelsAndPredictions = test_parsed.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda t: t[0] != t[1]).count() / float(test_parsed.count())
print(testErr)
# $example off$
if __name__ == "__main__":
    sc = SparkContext(
        appName="PythonGradientBoostedTreesClassificationExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingClassificationModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingClassificationModel")
# The data model is a dict mapping column name to {"min": x, "max": y} for
# numeric fields and [val1, val2, ...valN] for string fields
datamodel = dmt.computeDataModel(df)

DataModelTools.checkTargetForModelType(datamodel, target, model_type)

# Use DataModelTools to convert from DataFrame to an RDD of LabeledPoint
# for the specified target/predictors
lp = dmt.extractLabelledPoint(df, target, predictors).map(lambda x: x[1]).cache()

# Build the gradient-boosted trees model
from pyspark.mllib.tree import GradientBoostedTrees
if model_type == "classification":
    model = GradientBoostedTrees.trainClassifier(
        lp,
        categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df, predictors),
        loss=loss_param,
        numIterations=numIterations_param,
        learningRate=learningRate_param,
        maxDepth=maxDepth_param,
        maxBins=maxBins_param)
else:  # regression
    model = GradientBoostedTrees.trainRegressor(
        lp,
        categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df, predictors),
        loss=loss_param,
        numIterations=numIterations_param,
        learningRate=learningRate_param,
        maxDepth=maxDepth_param,
        maxBins=maxBins_param)

build_report = mbr.report(lp.count(), lp.getNumPartitions(),
# In[8]:

# Split the training set and test set
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# In[9]:

# Training models
RF_model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                        categoricalFeaturesInfo={},
                                        numTrees=3,
                                        featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=5,
                                        maxBins=32)

GB_model = GradientBoostedTrees.trainClassifier(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=3)

# In[10]:

# Prediction
def cal_mllib_accuracy(list):
    for i, clf in enumerate(list):
        # Predict with the features
        predictions = clf.predict(testData.map(lambda x: x.features))
        # Zip the labels with the predictions
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        accuracy = labelsAndPredictions.filter(
            lambda (v, p): v == p).count() / float(testData.count())
        # compare results
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingRegressionModel")
    # $example off$
# * **maxDepth** – Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 3)
# * **maxBins** – Maximum number of bins used for splitting features (default: 32). DecisionTree requires maxBins >= max categories.
#
# * `GradientBoostedTreesModel` represents the output of the boosting process: a linear combination of classification trees. The methods supported by this class are:
#   * `save(sc, path)` : save the tree to a given filename, sc is the Spark Context.
#   * `load(sc, path)` : the counterpart to save - load the classifier from a file.
#   * `predict(X)` : predict on a single datapoint (the `.features` field of a `LabeledPoint`) or an RDD of datapoints.
#   * `toDebugString()` : print the classifier in a human-readable format.

errors = {}
catInfo = {}
for i in range(10, 54):
    catInfo[i] = 2
depth = 13
model = GradientBoostedTrees.trainClassifier(trainingData,
                                             categoricalFeaturesInfo=catInfo,
                                             maxDepth=depth,
                                             numIterations=13,
                                             learningRate=0.15)
#print model.toDebugString()
errors[depth] = {}
dataSets = {'train': trainingData, 'test': testData}
for name in dataSets.keys():
    data = dataSets[name]
    Predicted = model.predict(data.map(lambda x: x.features))
    LabelsAndPredictions = data.map(lambda x: x.label).zip(Predicted)
    Err = LabelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count())
    errors[depth][name] = Err
print depth, errors[depth]
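# A compact sketch exercising the four GradientBoostedTreesModel methods
# listed above (save, load, predict, toDebugString); sc and trainingData are
# assumed from the surrounding code, and the save path is illustrative.
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

gbt = GradientBoostedTrees.trainClassifier(trainingData,
                                           categoricalFeaturesInfo={},
                                           numIterations=5)
print gbt.toDebugString()                        # human-readable dump of the ensemble
gbt.save(sc, "gbt_demo_model")                   # persist under an assumed path
reloaded = GradientBoostedTreesModel.load(sc, "gbt_demo_model")
print reloaded.predict(trainingData.first().features)  # single-point prediction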
all_data = np.array(zip(yy, xx))
sss = ShuffleSplit(len(all_data) - 1, test_size=0.20, random_state=1234)

for train_indexes, test_indexes in sss:
    lparr = []
    test_lp_arr = []
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]

    for medianvalue, record in sample_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        lparr.append(lp)

    for medianvalue, record in test_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        test_lp_arr.append(lp)

    training_data = sc.parallelize(lparr).cache()
    test_data_rdd = sc.parallelize(test_lp_arr).cache()

    regression_model = GradientBoostedTrees.trainRegressor(
        training_data, categoricalFeaturesInfo={}, numIterations=10,
        maxDepth=10)
    result = regression_model.predict(test_data_rdd.map(lambda x: x.features))
    print regression_model
    print regression_model.toDebugString()
    print "==============================="

    predicted_data = result.collect()
    actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()
    print mean_absolute_error(actual_data, predicted_data)
    break
#model = SVMWithSGD.train(train, 1.0)
### Change XCA
# TODO We are testing several MLs
# 1) LogisticRegression
#model = LogisticRegressionWithSGD.train(train)  # used for logistic regression classification
# 2) SVM classification
#model = SVMWithSGD.train(train)  # used for SVM classification
# 3) RandomForest
# Note: the random forest model in pyspark is experimental, so not sure whether it works perfectly or not
#model = RandomForest.trainClassifier(train, 2, {}, 300, seed=2)  # 300 trees is the best solution per the literature for this dataset
##### from doc
#model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
#                                     numTrees=3, featureSubsetStrategy="auto",
#                                     impurity='gini', maxDepth=4, maxBins=32)

# Gradient Boost
model = GradientBoostedTrees.trainClassifier(train, categoricalFeaturesInfo={},
                                             numIterations=30, maxDepth=4)

print "retrieving predictions and evaluating"
# Note: PySpark MLlib models cannot call predict() inside an RDD
# transformation, so predict on the features RDD and zip with the labels.
predictions = model.predict(test.map(lambda p: p.features))
predictionAndLabel = predictions.zip(test.map(lambda p: p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print "accuracy for GradientBoostedTrees:" + str(accuracy)
# In[45]:

Data1 = Data.sample(False, 0.1, seed=255).cache()
(trainingData, testData) = Data1.randomSplit([0.7, 0.3], seed=255)
# print 'Sizes: Data1=%d, trainingData=%d, testData=%d'%(Data1.count(),trainingData.cache().count(),testData.cache().count())

# In[59]:

from time import time

errors = {}
for depth in [10]:
    # Note: this trains on all of Data1, so testData overlaps the training
    # set; train on trainingData instead for an unbiased test error.
    model = GradientBoostedTrees.trainClassifier(Data1,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=10,
                                                 maxDepth=depth,
                                                 learningRate=0.25,
                                                 maxBins=54)
    #print model.toDebugString()
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():
        # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(
            lambda (v, p): v != p).count() / float(data.count())
        errors[depth][name] = Err
    print depth, errors[depth]

# In[ ]:
def parsePoint2(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

# Train data load
train_data_new = sc.textFile('/home/hduser/dataset.txt')
parsedData = train_data_new.map(parsePoint)

# Test data load
test_data_new = sc.textFile('/home/hduser/testfile.txt')
test_final = test_data_new.map(parsePoint2)

# Split train and test
X_train, X_test = parsedData.randomSplit([0.8, 0.2])

# Train the classifier
model = GradientBoostedTrees.trainClassifier(X_train,
                                             categoricalFeaturesInfo={},
                                             numIterations=10)

# Held-out 20% of the training data
predictions = model.predict(X_test.map(lambda x: x.features))
labelsAndPredictions1 = X_test.map(lambda p: p.label).zip(predictions)

# Test data
predictions1 = model.predict(test_final.map(lambda x: x.features))
y_final = test_final.map(lambda p: p.label).zip(predictions1)

# Error rate on the held-out split (divide by the held-out count, not the training count)
er = labelsAndPredictions1.filter(lambda (v, p): v != p).count() / float(X_test.count())
acc = (1 - er) * 100
print('===============================================================')
print(model.toDebugString())
print('===============================================================')
for i in y_final.collect():
print("Number of test set rows: %d" % test_data.count()) # COMMAND ---------- # MAGIC %md ### Train Gradient Boosted trees and Random Forest model # COMMAND ---------- from pyspark.mllib.tree import RandomForest from time import * from pyspark.mllib.tree import GradientBoostedTrees start_time = time() # Train a model Gradient Boosted Trees modelGBT = GradientBoostedTrees.trainClassifier(training_data, categoricalFeaturesInfo={}) end_time = time() elapsed_time_GBT = end_time - start_time print("Time to train GBT model: %.3f seconds" % elapsed_time_GBT) # Train a model Random Forest start_time = time() model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, \ numTrees=3, featureSubsetStrategy="auto", impurity="gini", \ maxDepth=4, maxBins=32, seed=SEED) end_time = time() elapsed_time_RF = end_time - start_time print("Time to train Random Forest model: %.3f seconds" % elapsed_time_RF)
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

# Read the file into an RDD
# If doing this on a real cluster, you need the file to be available on all nodes, ideally in HDFS.
path = '/HIGGS/HIGGS.csv'
inputRDD = sc.textFile(path)

# Transform the text RDD into an RDD of LabeledPoints
Data = inputRDD.map(lambda line: [float(x.strip()) for x in line.split(',')]) \
    .map(lambda x: LabeledPoint(x[0], x[1:]))

Data1 = Data.sample(False, 0.1, seed=255).cache()
(trainingData, testData) = Data1.randomSplit([0.7, 0.3], seed=255)
trainingData.cache()
testData.cache()

errors = {}
depth = 10
model = GradientBoostedTrees.trainClassifier(trainingData, {},
                                             numIterations=30,
                                             learningRate=0.3,
                                             maxDepth=depth)
errors[depth] = {}
dataSets = {'train': trainingData, 'test': testData}
for name in dataSets.keys():
    # Calculate errors on train and test sets
    data = dataSets[name]
    Predicted = model.predict(data.map(lambda x: x.features))
    LabelsAndPredictions = data.map(lambda x: x.label).zip(Predicted)
    Err = LabelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count())
    errors[depth][name] = Err
print depth, errors[depth]
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(),
                     dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=10, maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(),
                     rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(),
                     gbt_model.toDebugString())

    try:
        rmtree(temp_dir)
    except OSError:
        pass
trainingData = trainingData.map(lambda x: LabeledPoint(x[0], x[1]))

# <=================================================================================================================>
# RandomForest classifier
maxDepth_selection = [5, 10, 15, 20, 30]
maxBins_selection = [10, 20, 30, 40]
model_rf = RandomForest.trainClassifier(trainingData, numClasses=8,
                                        numTrees=800, featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=5, maxBins=30)
predictions_rf = model_rf.predict(testData.map(lambda x: x[1]))
labelsAndPredictions_rf = testData.map(lambda x: x[0]).zip(predictions_rf)
testErr_rf = labelsAndPredictions_rf.filter(
    lambda (v, p): v != p).count() / float(testData.count())
print "Test error is " + str(testErr_rf)

# <=================================================================================================================>
# Gradient-boosted decision trees
# Tuning order:
learningRate_selection = [0.1, 0.2, 0.3]
maxDepth_selection = [5, 10, 15, 20, 30]
# Note: MLlib's GradientBoostedTrees.trainClassifier supports only binary
# classification and takes no numClasses argument, so unlike the random
# forest above it cannot handle the 8-class labels directly.
model_xgbt = GradientBoostedTrees.trainClassifier(trainingData,
                                                  categoricalFeaturesInfo={},
                                                  loss='logLoss', numIterations=800,
                                                  learningRate=0.1, maxDepth=10)
predictions_xgbt = model_xgbt.predict(testData.map(lambda x: x[1]))
labelsAndPredictions_xgbt = testData.map(lambda x: x[0]).zip(predictions_xgbt)
testErr_xgbt = labelsAndPredictions_xgbt.filter(
    lambda (v, p): v != p).count() / float(testData.count())
print "Test error is " + str(testErr_xgbt)
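# Hedged sketch (not part of the original): one way to actually use the
# learningRate_selection / maxDepth_selection lists above -- a plain grid
# search scored on the held-out split. A separate validation split would be
# cleaner; the loop and variable names below are assumptions.
best_err, best_params = 1.0, None
for lr in learningRate_selection:
    for depth in maxDepth_selection:
        m = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 loss='logLoss', numIterations=100,
                                                 learningRate=lr, maxDepth=depth)
        preds = m.predict(testData.map(lambda x: x[1]))
        err = testData.map(lambda x: x[0]).zip(preds) \
                      .filter(lambda (v, p): v != p).count() / float(testData.count())
        if err < best_err:
            best_err, best_params = err, (lr, depth)
print "best (learningRate, maxDepth):", best_params, "error:", best_err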
import sys

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")

data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_ssim.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_360.txt')

model = GradientBoostedTrees.trainRegressor(traindata, categoricalFeaturesInfo={},
                                            numIterations=5)

# The model is trained on traindata but evaluated on the full ssim set, so
# these are evaluation MSEs, not training MSEs as the original prints claimed.
predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(data.count())
print("MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/ssim_rbt")

predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(predictions_720)
MSE_720 = labelsandpredictions_720.map(lambda (v, p): (v - p) * (v - p)).sum() / float(data_720.count())
print("MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/ssim_720_rbt")

predictions_540 = model.predict(data_540.map(lambda x: x.features))
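# Hedged refactor sketch (not in the original): the per-resolution MSE blocks
# repeat the same logic, so a small helper keeps the remaining 540/360 passes
# consistent with the ones above.
def mse_for(dataset):
    preds = model.predict(dataset.map(lambda x: x.features))
    pairs = dataset.map(lambda lp: lp.label).zip(preds)
    return pairs.map(lambda (v, p): (v - p) * (v - p)).sum() / float(dataset.count())

for name, d in [('720', data_720), ('540', data_540), ('360', data_360)]:
    print("MSE_" + name + " = " + str(mse_for(d)))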
# The original snippet begins mid-function; the parsePoint header and the
# fields/features_raw setup below are reconstructed (which columns hold the
# features is an assumption -- only label = fields[11] is given in the source).
def parsePoint(line):
    fields = line.split(',')
    features_raw = fields[:11]  # assumption: feature columns precede the label
    features = []
    for x in features_raw:
        feature = float(x.strip().strip("'").strip())
        features.append(feature)
    label = float(fields[11])
    # print("label=" + str(label))
    return LabeledPoint(label, features)

data = sc.textFile("/Users/jiayangan/project/SearchAds/data/log/ctr_features_demo3/part*")
(trainingData, testData) = data.randomSplit([0.7, 0.3])
parsedTrainData = trainingData.map(parsePoint)
parsedTestData = testData.map(parsePoint)

# Train a GradientBoostedTrees model.
# Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
#        (b) Use more iterations in practice.
model = GradientBoostedTrees.trainClassifier(parsedTrainData, categoricalFeaturesInfo={},
                                             numIterations=100, maxDepth=3)

# Evaluate model on test instances and compute test error
predictions = model.predict(parsedTestData.map(lambda x: x.features))
labelsAndPredictions = parsedTestData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(parsedTestData.count())
print('test error = ' + str(testErr))
print('Learned classification GBT model:')
print(model.toDebugString())
print("tree totalNumNodes " + str(model.totalNumNodes()))

# Save and load model
model.save(sc, "/Users/jiayangan/project/SearchAds/data/model/ctr_gbdt_model_demo_20")
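# Hedged sketch (not in the original): round-tripping the model saved above to
# check that loading reproduces the same ensemble.
from pyspark.mllib.tree import GradientBoostedTreesModel

sameModel = GradientBoostedTreesModel.load(
    sc, "/Users/jiayangan/project/SearchAds/data/model/ctr_gbdt_model_demo_20")
print(sameModel.toDebugString() == model.toDebugString())  # expect True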
##### Trees #####
##### Now let's try three variants of tree-based classification.
##### The API is slightly different from the previous algos: the train
##### methods are classmethods, so no instantiation is needed.

from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.tree import RandomForest

model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={})
score(model)

model = GradientBoostedTrees.trainClassifier(training_data, categoricalFeaturesInfo={}, numIterations=10)
score(model)

model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=16)
score(model)

#### Naive Bayes ####
#### Last but not least, let's try the Naive Bayes classifier.

from pyspark.mllib.classification import NaiveBayes

model = NaiveBayes.train(training_data)
score(model)
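#### The calls above assume a score() helper; a minimal sketch of one
#### plausible implementation (an assumption -- the original helper is not
#### shown), evaluating accuracy on a test_data RDD of LabeledPoints:

def score(model):
    predictions = model.predict(test_data.map(lambda p: p.features))
    labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
    accuracy = labels_and_preds.filter(lambda (l, p): l == p).count() / float(test_data.count())
    print("accuracy: %.4f" % accuracy)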
# * **maxBins** -- maximum number of bins used for splitting features (default: 32).
#   DecisionTree requires maxBins >= the largest number of categories.
#
# * `GradientBoostedTreesModel` represents the output of the boosting process: a linear
#   combination of classification trees. The methods supported by this class are:
#   * `save(sc, path)`: save the model under the given path; sc is the SparkContext.
#   * `load(sc, path)`: the counterpart to save -- load a classifier from file.
#   * `predict(X)`: predict on a single datapoint (the `.features` field of a
#     `LabeledPoint`) or an RDD of datapoints.
#   * `toDebugString()`: print the classifier in a human-readable format.

# In[32]:

from time import time

errors = {}
for depth in [10]:
    start = time()
    model = GradientBoostedTrees.trainClassifier(trainingData, {}, maxDepth=depth,
                                                 numIterations=30)
    # print model.toDebugString()
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count())
        errors[depth][name] = Err
    print depth, errors[depth], int(time() - start), 'seconds'
# print errors

# In[33]:
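# Hedged sketch (not in the original notebook): exercising the save / load /
# predict methods listed above; the path is an assumption.
from pyspark.mllib.tree import GradientBoostedTreesModel

model_path = '/tmp/gbt_higgs_model'  # hypothetical location
model.save(sc, model_path)
reloaded = GradientBoostedTreesModel.load(sc, model_path)
sample = testData.first()
print reloaded.predict(sample.features), 'vs. label', sample.label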
Err = 0.0
results = []
for train_index, test_index in ss:
    X_training, Y_training, X_test, Y_test = [], [], [], []
    for i in train_index:
        X_training.append(X[i])
        Y_training.append(Y[i])
    for i in test_index:
        X_test.append(X[i])
        Y_test.append(Y[i])
    parsedData = []
    for i in range(0, len(X_training)):
        parsedData.append(LabeledPoint(Y_training[i], X_training[i]))
    model = GradientBoostedTrees.trainClassifier(sc.parallelize(parsedData), {},
                                                 numIterations=10)
    testErr = 0
    for i in range(0, len(X_test)):
        a = Y_test[i]
        b = model.predict(X_test[i])
        if a != b:
            testErr += 1
    Err += float(testErr) / float(len(X_test))
print("AVG test error: %.6f" % (Err / iter_number))
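# Hedged alternative (not in the original): calling model.predict() once per
# point pays a JVM round trip each time; predicting on an RDD in one pass is
# the idiomatic MLlib pattern for the same per-fold error computation.
testRDD = sc.parallelize([LabeledPoint(Y_test[i], X_test[i])
                          for i in range(len(X_test))])
preds = model.predict(testRDD.map(lambda p: p.features))
fold_err = testRDD.map(lambda p: p.label).zip(preds) \
                  .filter(lambda (l, p): l != p).count() / float(testRDD.count())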
# In[6]:

Data1 = Data.sample(False, 0.1, seed=255).cache()
(trainingData, testData) = Data1.randomSplit([0.7, 0.3], seed=255)

# ### Gradient Boosted Trees

# In[7]:

from time import time

errors = {}
for depth in [10]:
    # Train on the training split only; fitting on Data1 (as the original did)
    # leaks the test rows into training and understates the test error.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=10, maxDepth=depth,
                                                 learningRate=0.25, maxBins=35)
    # print model.toDebugString()
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda x: x.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count())
        errors[depth][name] = Err
    print depth, errors[depth]

# In[ ]:
from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")

import sys, pickle, math

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('gradient-boosted-trees')
sc = SparkContext(conf=conf)
input_path = sys.argv[1]

# Load and parse the data: each pickled record is (features, label)
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])

train = sc.pickleFile(input_path + '/bow_train/part-00000')
test = sc.pickleFile(input_path + '/bow_test/part-00000')
# Keep only points that actually have features; labels are floats, so the
# original len(line.label) check would raise a TypeError.
parsedtrain = train.map(parsePoint).filter(lambda line: len(line.features) != 0)
parsedtest = test.map(parsePoint).filter(lambda line: len(line.features) != 0).cache()

model = GradientBoostedTrees.trainRegressor(parsedtrain, categoricalFeaturesInfo={},
                                            numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
val_err = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE = math.sqrt(val_err)
print("Root Mean Squared Error Test = " + str(RMSE))
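# Hedged follow-up sketch (an assumption, not in the original): numIterations=1
# above fits a single tree, so a quick sweep over boosting rounds shows whether
# more iterations lower the test RMSE.
for n_iter in [1, 10, 50]:
    m = GradientBoostedTrees.trainRegressor(parsedtrain, categoricalFeaturesInfo={},
                                            numIterations=n_iter)
    p = m.predict(parsedtest.map(lambda x: x.features))
    mse = parsedtest.map(lambda lp: lp.label).zip(p) \
                    .map(lambda (v, pr): (v - pr) * (v - pr)).sum() / float(parsedtest.count())
    print("iterations=%d test RMSE=%.4f" % (n_iter, math.sqrt(mse)))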