def rfTest(sqlContext, dataset_rdd):
    # Split the dataset into positive and negative examples by label.
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negative = dataset_rdd.filter(lambda e: e[1] < 0.5)
    # 80/20 train/test split within each class to keep the class balance.
    train_positive = dataset_positive.sample(False, 0.8)
    test_positive = dataset_positive.subtract(train_positive)
    train_negative = dataset_negative.sample(False, 0.8)
    test_negative = dataset_negative.subtract(train_negative)
    trainset_rdd = train_positive.union(train_negative)
    testset_rdd = test_positive.union(test_negative)
    trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    trainset_nums = trainset.count()
    testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    testset_nums = testset.count()
    trainset_positive = train_positive.count()
    testset_positive = test_positive.count()
    model = RandomForest.trainClassifier(trainset, 2, {}, 3)
    predictions = model.predict(testset.map(lambda x: x.features))
    predict = testset.map(lambda lp: lp.label).zip(predictions)
    hitALL = predict.filter(lambda e: e[0] == e[1]).count()
    hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
    positive = predict.filter(lambda e: e[1] > 0.5).count()
    recallPositive = hitPositive / float(testset_positive)
    precision = hitPositive / float(positive)
    accuracy = hitALL / float(testset.count())
    F_Value = 2 / (1 / precision + 1 / recallPositive)
    return (trainset_nums, testset_nums, trainset_positive, testset_positive, positive,
            hitPositive, precision, recallPositive, accuracy, F_Value, model)
def Random_Forest(filename, sc):
    # NOTE: the hard-coded path below overrides the filename argument.
    filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    #model.save(sc, "target/tmp/myRandomForestClassificationModel")
    #sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
def generateRandomForest():
    if os.path.exists(RF_PATH):
        print("RF_PATH Already available")
        return

    data = sc.textFile(F_PATH).map(parseLine)

    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1L)

    # Train a RandomForest model.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=len(classes),
                                         categoricalFeaturesInfo={},
                                         numTrees=4, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error', str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    modelStatistics(labelsAndPredictions)

    # Save and load model
    model.save(sc, RF_PATH)
    print("Saved RF Model.")
def testOnce():
    # split the data into training and testing sets
    (trainingData, testData) = data.randomSplit([1 - test_size, test_size])

    # train the random forest
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         numTrees=num_trees, featureSubsetStrategy=strat,
                                         impurity='gini', maxDepth=max_depth, maxBins=32)

    # test the random forest
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())

    # confusion-matrix counts: galaxies carry label 0, stars carry label 1
    Mg = float(labelsAndPredictions.filter(lambda (v, p): v == 0 and p == 1).count())
    Ng = float(labelsAndPredictions.filter(lambda (v, p): v == 0 and p == 0).count())
    Ms = float(labelsAndPredictions.filter(lambda (v, p): v == 1 and p == 0).count())
    Ns = float(labelsAndPredictions.filter(lambda (v, p): v == 1 and p == 1).count())

    probsAndScores = probTest(testData, model)
    threshold_accuracy = probsAndScores[0]
    probs = probsAndScores[1].map(lambda x: x / num_trees)

    labelsAndPredictions = labelsAndPredictions.zip(probs)
    labelsAndProbs = testData.map(lambda lp: lp.label).zip(probs)
    save(labelsAndProbs, 'answers')

    print('Galaxy Purity = ' + str(Ng / (Ng + Ms)))
    print('Galaxy Completeness = ' + str(Ng / (Ng + Mg)))
    print('Star Purity = ' + str(Ns / (Ns + Mg)))
    print('Star Completeness = ' + str(Ns / (Ns + Ms)))
    print('Accuracy = ' + str(1 - testErr))
    print('Threshold method accuracy = ' + str(threshold_accuracy))
def main(): sc = SparkContext(appName="MyApp") sc.setLogLevel('ERROR') # Parse data train_labels, train_data = load_data('train.csv') dummy_labels, test_data = load_data('test.csv', use_labels=False) # Truncate the last 2 features of the data for dataPoint in train_data: len = np.size(dataPoint) dataPoint = np.delete(dataPoint, [len - 2, len - 1]) for dataPoint in test_data: len = np.size(dataPoint) dataPoint = np.delete(dataPoint, [len - 2, len - 1]) # Map each data point's label to its features train_set = reformatData(train_data, train_labels) test_set = reformatData(test_data, dummy_labels) # Parallelize the data parallelized_train_set = sc.parallelize(train_set) parallelized_test_set = sc.parallelize(test_set) # Split the data trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42) # Train the models randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={}, numTrees=750, seed=42, maxDepth=30, maxBins=32) # Test the model testRandomForest(randomForestModel, parallelized_test_set)
def main():
    input_train = sys.argv[1]
    input_test = sys.argv[2]

    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()

    '''sbaronia - get training and testing labeled points'''
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()

    '''sbaronia - run RandomForest regression on our training data with
    default options except numTrees = 5'''
    rf_model = RandomForest.trainRegressor(train_lp, categoricalFeaturesInfo={},
                                           numTrees=5, featureSubsetStrategy="auto",
                                           impurity='variance', maxDepth=4, maxBins=32)

    '''sbaronia - run predictions on testing data and calculate RMSE value'''
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    rmse = math.sqrt(labelsAndPredictions.map(lambda (v, p): (v - p) ** 2)
                     .reduce(lambda x, y: x + y) / float(test_lp.count()))

    print("RMSE = " + str(rmse))
def test_regression(self): from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ RidgeRegressionWithSGD from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees data = [ LabeledPoint(-1.0, [0, -1]), LabeledPoint(1.0, [0, 1]), LabeledPoint(-1.0, [0, -2]), LabeledPoint(1.0, [0, 2]) ] rdd = self.sc.parallelize(data) features = [p.features.tolist() for p in data] lr_model = LinearRegressionWithSGD.train(rdd, iterations=10) self.assertTrue(lr_model.predict(features[0]) <= 0) self.assertTrue(lr_model.predict(features[1]) > 0) self.assertTrue(lr_model.predict(features[2]) <= 0) self.assertTrue(lr_model.predict(features[3]) > 0) lasso_model = LassoWithSGD.train(rdd, iterations=10) self.assertTrue(lasso_model.predict(features[0]) <= 0) self.assertTrue(lasso_model.predict(features[1]) > 0) self.assertTrue(lasso_model.predict(features[2]) <= 0) self.assertTrue(lasso_model.predict(features[3]) > 0) rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10) self.assertTrue(rr_model.predict(features[0]) <= 0) self.assertTrue(rr_model.predict(features[1]) > 0) self.assertTrue(rr_model.predict(features[2]) <= 0) self.assertTrue(rr_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories dt_model = DecisionTree.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) self.assertTrue(dt_model.predict(features[3]) > 0) rf_model = RandomForest.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1) self.assertTrue(rf_model.predict(features[0]) <= 0) self.assertTrue(rf_model.predict(features[1]) > 0) self.assertTrue(rf_model.predict(features[2]) <= 0) self.assertTrue(rf_model.predict(features[3]) > 0) gbt_model = GradientBoostedTrees.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4) self.assertTrue(gbt_model.predict(features[0]) <= 0) self.assertTrue(gbt_model.predict(features[1]) > 0) self.assertTrue(gbt_model.predict(features[2]) <= 0) self.assertTrue(gbt_model.predict(features[3]) > 0) try: LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) except ValueError: self.fail()
def trainRandomForestModel(data):
    """
    Train a random forest regression model and return it
    :param data: RDD[LabeledPoint]
    :return: random forest regression model
    """
    from pyspark.mllib.tree import RandomForest
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={}, numTrees=2000,
                                        featureSubsetStrategy="auto", impurity="variance",
                                        maxDepth=4, maxBins=32)
    return model
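A minimal usage sketch for `trainRandomForestModel`, assuming a live `SparkContext` named `sc` and a tiny in-memory RDD of `LabeledPoint`s (both are assumptions, not part of the snippet above); since `numTrees=2000` is hard-coded inside the helper, real inputs should be much larger than this toy RDD.

```python
# Hypothetical usage of trainRandomForestModel; `sc` and the toy data are assumptions.
from pyspark.mllib.regression import LabeledPoint

toy_data = sc.parallelize([
    LabeledPoint(1.5, [0.0, 3.5]),
    LabeledPoint(0.7, [1.0, 2.1]),
    LabeledPoint(2.3, [0.5, 4.0]),
    LabeledPoint(0.2, [1.5, 1.0]),
])
model = trainRandomForestModel(toy_data)
print(model.predict([0.8, 3.2]))  # single-vector prediction on the driver
```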
def train_model(cls, trainData, cateFeaInfo={}, trees=3, impurity="gini",
                depth=4):
    """
    Train the model
    """
    model = RandomForest.trainClassifier(trainData, numClasses=2,
                                         categoricalFeaturesInfo=cateFeaInfo, numTrees=trees,
                                         featureSubsetStrategy="auto", impurity=impurity,
                                         maxDepth=depth, maxBins=32)
    return model
def trainModel(trainingData):
    print "\nTraining Random Forest model started!"
    Utils.logTime()

    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=5, maxBins=32)

    print '\nTraining Random Forest model finished'
    Utils.logTime()
    return model
def evaluate(self, trainingData, testData=None, metric=None):
    if testData is not None:
        model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                             categoricalFeaturesInfo={}, numTrees=10,
                                             featureSubsetStrategy="auto", impurity='gini',
                                             maxDepth=4, maxBins=32)
        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
        print('Test Error = ' + str(testErr))
    else:
        # cross validation
        pass
def getRandomForestRMSE(trees_array): valRMSE_list = [] for trees in trees_array: model = RandomForest.trainRegressor(train_featureScoreTimeRDD, categoricalFeaturesInfo={}, numTrees=trees, featureSubsetStrategy="auto", impurity='variance', maxDepth=4, maxBins=32) predictions = model.predict(val_featureScoreTimeRDD.map(lambda lp: lp.features)) labelsAndPreds = val_featureScoreTimeRDD.map(lambda lp: lp.label).zip(predictions) valMSE = labelsAndPreds.map(lambda (v, p): (v - p)*(v-p)).sum() / float(val_featureScoreTimeRDD.count()) valRMSE=valMSE**0.5 valRMSE_list.append((trees, valRMSE)) return valRMSE_list
def trainOptimalModel(trainingData, testData):
    print "\nTraining optimal Random Forest model started!"
    Utils.logTime()

    numTreesVals = [3, 5, 8]
    featureSubsetStrategyVals = ['auto', 'all', 'sqrt', 'log2', 'onethird']
    impurityVals = ['gini', 'entropy']
    maxDepthVals = [3, 4, 5, 6, 7]
    maxBinsVals = [8, 16, 32]

    optimalModel = None
    optimalNumTrees = None
    optimalFeatureSubsetStrategy = None
    optimalMaxDepth = None
    optimalImpurity = None
    optimalBinsVal = None
    minError = None

    try:
        # exhaustive grid search over the hyperparameter values above
        for curNumTree in numTreesVals:
            for curFeatureSubsetStrategy in featureSubsetStrategyVals:
                for curImpurity in impurityVals:
                    for curMaxDepth in maxDepthVals:
                        for curMaxBins in maxBinsVals:
                            model = RandomForest.trainClassifier(trainingData,
                                                                 numClasses=2,
                                                                 categoricalFeaturesInfo={},
                                                                 numTrees=curNumTree,
                                                                 featureSubsetStrategy=curFeatureSubsetStrategy,
                                                                 impurity=curImpurity,
                                                                 maxDepth=curMaxDepth,
                                                                 maxBins=curMaxBins)
                            testErr = Evaluation.evaluate(model, testData)
                            # keep the model with the lowest test error seen so far
                            if minError is None or testErr < minError:
                                minError = testErr
                                optimalNumTrees = curNumTree
                                optimalFeatureSubsetStrategy = curFeatureSubsetStrategy
                                optimalImpurity = curImpurity
                                optimalMaxDepth = curMaxDepth
                                optimalBinsVal = curMaxBins
                                optimalModel = model
    except:
        # report the parameter combination that was being trained when the exception occurred
        msg = "\nException during model training with below parameters:"
        msg += "\tnum trees: " + str(curNumTree)
        msg += "\tfeature subset strategy: " + curFeatureSubsetStrategy
        msg += "\timpurity: " + str(curImpurity)
        msg += "\tmaxDepth: " + str(curMaxDepth)
        msg += "\tmaxBins: " + str(curMaxBins)
        Utils.logMessage(msg)

    logMessage(optimalModel, optimalNumTrees, optimalFeatureSubsetStrategy, optimalMaxDepth,
               optimalImpurity, optimalBinsVal, minError)
    return optimalModel
def test_regression(self): from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ RidgeRegressionWithSGD from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees data = [ LabeledPoint(-1.0, [0, -1]), LabeledPoint(1.0, [0, 1]), LabeledPoint(-1.0, [0, -2]), LabeledPoint(1.0, [0, 2]) ] rdd = self.sc.parallelize(data) features = [p.features.tolist() for p in data] lr_model = LinearRegressionWithSGD.train(rdd) self.assertTrue(lr_model.predict(features[0]) <= 0) self.assertTrue(lr_model.predict(features[1]) > 0) self.assertTrue(lr_model.predict(features[2]) <= 0) self.assertTrue(lr_model.predict(features[3]) > 0) lasso_model = LassoWithSGD.train(rdd) self.assertTrue(lasso_model.predict(features[0]) <= 0) self.assertTrue(lasso_model.predict(features[1]) > 0) self.assertTrue(lasso_model.predict(features[2]) <= 0) self.assertTrue(lasso_model.predict(features[3]) > 0) rr_model = RidgeRegressionWithSGD.train(rdd) self.assertTrue(rr_model.predict(features[0]) <= 0) self.assertTrue(rr_model.predict(features[1]) > 0) self.assertTrue(rr_model.predict(features[2]) <= 0) self.assertTrue(rr_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories dt_model = DecisionTree.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) self.assertTrue(dt_model.predict(features[3]) > 0) rf_model = RandomForest.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100) self.assertTrue(rf_model.predict(features[0]) <= 0) self.assertTrue(rf_model.predict(features[1]) > 0) self.assertTrue(rf_model.predict(features[2]) <= 0) self.assertTrue(rf_model.predict(features[3]) > 0) gbt_model = GradientBoostedTrees.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(gbt_model.predict(features[0]) <= 0) self.assertTrue(gbt_model.predict(features[1]) > 0) self.assertTrue(gbt_model.predict(features[2]) <= 0) self.assertTrue(gbt_model.predict(features[3]) > 0)
def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees data = [ LabeledPoint(0.0, [1, 0, 0]), LabeledPoint(1.0, [0, 1, 1]), LabeledPoint(0.0, [2, 0, 0]), LabeledPoint(1.0, [0, 2, 1]) ] rdd = self.sc.parallelize(data) features = [p.features.tolist() for p in data] lr_model = LogisticRegressionWithSGD.train(rdd) self.assertTrue(lr_model.predict(features[0]) <= 0) self.assertTrue(lr_model.predict(features[1]) > 0) self.assertTrue(lr_model.predict(features[2]) <= 0) self.assertTrue(lr_model.predict(features[3]) > 0) svm_model = SVMWithSGD.train(rdd) self.assertTrue(svm_model.predict(features[0]) <= 0) self.assertTrue(svm_model.predict(features[1]) > 0) self.assertTrue(svm_model.predict(features[2]) <= 0) self.assertTrue(svm_model.predict(features[3]) > 0) nb_model = NaiveBayes.train(rdd) self.assertTrue(nb_model.predict(features[0]) <= 0) self.assertTrue(nb_model.predict(features[1]) > 0) self.assertTrue(nb_model.predict(features[2]) <= 0) self.assertTrue(nb_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories dt_model = DecisionTree.trainClassifier( rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) self.assertTrue(dt_model.predict(features[3]) > 0) rf_model = RandomForest.trainClassifier( rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100) self.assertTrue(rf_model.predict(features[0]) <= 0) self.assertTrue(rf_model.predict(features[1]) > 0) self.assertTrue(rf_model.predict(features[2]) <= 0) self.assertTrue(rf_model.predict(features[3]) > 0) gbt_model = GradientBoostedTrees.trainClassifier( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(gbt_model.predict(features[0]) <= 0) self.assertTrue(gbt_model.predict(features[1]) > 0) self.assertTrue(gbt_model.predict(features[2]) <= 0) self.assertTrue(gbt_model.predict(features[3]) > 0)
def testRegression(trainingData, testData):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
        .sum() / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())
def testClassification(trainingData, testData):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda v_p: v_p[0] != v_p[1]).count()\
        / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
def train_trend_model(self, model, data, i): self.logger.info('Start to train the direction model') rdd_data = self.sc.parallelize(data) if self.trend_prediction_method == self.RANDOM_FOREST: model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40, featureSubsetStrategy="auto", impurity='gini', maxDepth=20, maxBins=32) elif self.trend_prediction_method == self.NAIVE_BAYES: model = NaiveBayes.train(rdd_data) elif self.trend_prediction_method == self.LOGISTIC_REGRESSION: model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001, initialWeights=None if model is None else model.weights) elif self.trend_prediction_method == self.SVM: model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001, initialWeights=None if model is None else model.weights) return model
def create_model(name, training): if name == 'logistic': print_box() print "Logistic Regression Model" print_box() model = LogisticRegressionWithLBFGS.train(training) elif name == 'tree': print_box() print "Decision Tree Model" print_box() model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=32) elif name == 'rf': print_box() print "Random Forest Model" print_box() model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={}, numTrees=15, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=50) return model
def kfolds (): #folds = kFold(data, k) this would work in java acc = 0 spurity = 0 scomp = 0 gpurity = 0 gcomp = 0 foldsize = data.count()/k tested = sc.parallelize([]) for i in range(k): test = sc.parallelize(data.subtract(tested).takeSample(False, foldsize)) tested = tested.union(test) train = data.subtract(test) # train the random forest model = RandomForest.trainClassifier(train, numClasses=2, categoricalFeaturesInfo={}, numTrees=num_trees, featureSubsetStrategy="auto", impurity='gini', maxDepth = max_depth, maxBins=32) predictions = model.predict(test.map(lambda x: x.features)) labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count()) Mg = float(labelsAndPredictions.filter(lambda (v, p): v == 0 and p == 1).count()) Ng = float(labelsAndPredictions.filter(lambda (v, p): v == 0 and p == 0).count()) Ms = float(labelsAndPredictions.filter(lambda (v, p): v == 1 and p == 0).count()) Ns = float(labelsAndPredictions.filter(lambda (v, p): v == 1 and p == 1).count()) gpurity += (Ng / (Ng+Ms)) gcomp += (Ng / (Ng+Mg)) spurity += (Ns / (Ns+Mg)) scomp += (Ns/(Ns+Ms)) acc += (1 - testErr) print 'with '+ str(k) + ' folds:' print ('Average Galaxy Purity = ' + str(gpurity / k)) print ('Average Galaxy Completeness = ' + str(gcomp / k)) print ('Average Star Purity = ' + str(spurity / k)) print ('Average Star Completeness = ' + str(scomp / k)) print ('Average Accuracy = ' + str(acc / k))
def train_amount_model(self, model, data, i): rdd_data = self.sc.parallelize(data) self.logger.info('Start to train the amount model') if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK: input_num = self.feature_num layers = [input_num, input_num / 3 * 2, input_num / 3, 1] neural_network = NeuralNetworkSpark(layers=layers, bias=0) model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234, learn_rate=0.0001, iteration=15, model=model) elif self.amount_prediction_method == self.RANDOM_FOREST: model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={}, numTrees=40, featureSubsetStrategy="auto", impurity='variance', maxDepth=20, maxBins=32) elif self.amount_prediction_method == self.LINEAR_REGRESSION: model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001, initialWeights=model.weights if model is not None else None) else: self.logger.error("Unknown training method {}".format(self.amount_prediction_method)) raise ValueError("Unknown training method {}".format(self.amount_prediction_method)) return model
data = sc.textFile("team_result.txt") data = data.map(lambda line: line.split(",")) data = data.map(lambda x: LabeledPoint(float(x[5]), [x[0], x[1], x[2], x[3], x[4]])) # Split the dataset into training set (70%) and test set (30%) trainingData, testData = data.randomSplit([0.7, 0.3], seed=1071) # Create and train the naive Bayes model naiveBayesModel = NaiveBayes.train(trainingData, 1.0) # Apply the model to the test set predictionAndLabelNaiveBayes = testData.map(lambda x: (naiveBayesModel.predict(x.features), x.label)) # Calculate the accuracy of the model errorNaiveBayes = 1.0 * predictionAndLabelNaiveBayes.filter(lambda (x, y): x != y).count() / testData.count() print "Naive Bayes model classification error: {0:f}".format(errorNaiveBayes) # Create and train the random forest model randomForestModel = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={0: 9, 1: 9, 2: 9, 3: 9, 4: 9}, numTrees=3, impurity="gini", maxDepth=4, maxBins=32, seed=1071) ''' Note taken from the official API documentation: In Python, predict cannot currently be used within an RDD transformation or action. Call predict directly on the RDD instead. ''' predictionsRandomForest = randomForestModel.predict(testData.map(lambda x: x.features)) labelsAndPredictionsRF = testData.map(lambda x: x.label).zip(predictionsRandomForest) errorRandomForest = labelsAndPredictionsRF.filter(lambda (x, y): x != y).count() / float(testData.count()) print "Random forest classification error: {0:f}".format(errorRandomForest)
def train_randomforest_model(dataset):
    # dataset: RDD[LabeledPoint]; binary classification, no categorical features, 3 trees
    model = RandomForest.trainClassifier(dataset, 2, {}, 3, seed=42)
    return model
val_data = truetestData.map(lambda line: LabeledPoint(line[7], line[0:7])) # debug print(data.take(1)) print(val_data.take(1)) # for holdout validation (trData, tData) = data.randomSplit([0.7, 0.3]) # random forest training model mod = RandomForest.trainRegressor(trData, categoricalFeaturesInfo={ 0: 13, 1: 1499, 2: 2 }, numTrees=4, featureSubsetStrategy="auto", impurity='variance', maxDepth=8, maxBins=1500) # prediction and evaluation predictions = mod.predict(tData.map(lambda x: x.features)) pred = mod.predict(val_data.map(lambda x: x.features)) labelsAndPredictions = tData.map(lambda lp: lp.label).zip(predictions) truePred = val_data.map(lambda lp: lp.label).zip(pred) metrics = RegressionMetrics(labelsAndPredictions) met2 = RegressionMetrics(truePred) # Squared Error print("Validation MSE = %s" % metrics.meanSquaredError)
(model1.predict(p.features), p.label)) #model 2 from pyspark.mllib.classification import SVMWithSGD model2 = SVMWithSGD.train(training, iterations=100) predictionAndLabel_SVM = test.map(lambda p: (model2.predict(p.features), p.label)) #model 3 from pyspark.mllib.tree import RandomForest model = RandomForest.trainClassifier(training, numClasses=6, numTrees=2, categoricalFeaturesInfo={}, featureSubsetStrategy="auto", maxDepth=6, maxBins=32) predictions = model.predict(test.map(lambda x: x.features)) predictionAndLabel_RF = test.map(lambda lp: lp.label).zip(predictions) # -------------- La phase prediction ------------# def accuracy(predictionAndLabel): return 1.0 * predictionAndLabel.filter( lambda pl: pl[0] == pl[1]).count() / test.count() print('model accuracy {}'.format(accuracy(predictionAndLabel_NB))) print('model accuracy {}'.format(accuracy(predictionAndLabel_SVM)))
# Evaluate model on test instances and compute test error predictions = GBTmodel.predict(test_dense.rdd.map(lambda x: x.features.values)) labelsAndPredictions = test_dense.rdd.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter( lambda lp: lp[0] != lp[1]).count() / float(test_dense.rdd.count()) print('Test Error = ' + str(testErr)) from pyspark.mllib.tree import RandomForest, RandomForestModel print('Learned classification RF model:') train_start = time.time() RFmodel = RandomForest.trainClassifier(labelPoint_train, numClasses=2, categoricalFeaturesInfo={}, numTrees=30, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32) train_end = time.time() print(f'Time elapsed training model: {train_end - train_start} seconds') predictions = RFmodel.predict(test_dense.rdd.map(lambda x: x.features.values)) labelsAndPredictions = test_dense.rdd.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter( lambda lp: lp[0] != lp[1]).count() / float(test_dense.rdd.count()) print('Test Error = ' + str(testErr)) spark.stop()
def takeAndPrint(time, rdd, num=1000): sqlContext = SQLContext(sc) result = [] taken = rdd.take(num + 1) print("-------------------------------------------") print("Time: %s" % time) print("-------------------------------------------") for record in taken[:num]: vals = tuple(record.split(",")) #[tuple(['Alice', '1'])] result.append(vals) print(type(result)) print(result) df = sqlContext.createDataFrame(result).collect() df.show() # Dataframe for MLLIB's # panda dataframe sample_data = df.sample(False, 0.5, 83).toPandas() sample_data.head() # find category and numerical variables numeric_cols = [ "account_length", "number_vmail_messages", "total_day_minutes", "total_day_calls", "total_day_charge", "total_eve_minutes", "total_eve_calls", "total_eve_charge", "total_night_minutes", "total_night_calls", "total_intl_minutes", "total_intl_calls", "total_intl_charge" ] categorical_cols = [ "state", "international_plan", "voice_mail_plan", "area_code" ] #some plots ax = sb.boxplot(x="churned", y="number_customer_service_calls", data=sample_data, palette="Set3") ax.set(xlabel="Churned", ylabel="Number of calls made to the customer service") plt.show() example_numeric_data = sample_data[[ "total_day_minutes", "total_day_calls", "total_day_charge", "churned" ]] sb.pairplot(example_numeric_data, hue="churned", palette="husl") plt.show() # correlation and heatmap corr = sample_data[[ "account_length", "number_vmail_messages", "total_day_minutes", "total_day_calls", "total_day_charge", "total_eve_minutes", "total_eve_calls", "total_eve_charge", "total_night_minutes", "total_night_calls", "total_intl_minutes", "total_intl_calls", "total_intl_charge" ]].corr() sb.heatmap(corr) reduced_numeric_cols = [ "account_length", "number_vmail_messages", "total_day_calls", "total_day_charge", "total_eve_calls", "total_eve_charge", "total_night_calls", "total_intl_calls", "total_intl_charge" ] label_indexer = StringIndexer(inputCol='churned', outputCol='label') plan_indexer = StringIndexer(inputCol='intl_plan', outputCol='intl_plan_indexed') assembler = VectorAssembler(inputCols=['intl_plan_indexed'] + reduced_numeric_cols, outputCol='features') classifier = DecisionTreeClassifier(labelCol='label', featuresCol='features') pipeline = Pipeline( stages=[plan_indexer, label_indexer, assembler, classifier]) (train, test) = df.randomSplit([0.7, 0.3]) model = pipeline.fit(train) # Random forest from pyspark.mllib.tree import RandomForest model2 = RandomForest.trainClassifier(train, numClasses=2, numTrees=3, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32) # SVM needs some tweaking- not working as expected # ROC chart from pyspark.ml.evaluation import BinaryClassificationEvaluator predictions = model.transform(test) evaluator = BinaryClassificationEvaluator() auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
def main(): sc = SparkContext(conf=SparkConf().setAppName("Random Forest")) sqlContext = SQLContext(sc) bytePath = "s3n://eds-uga-csci8360/data/project2/binaries" namePath = "s3n://eds-uga-csci8360/data/project2/labels/X_train_small.txt" nameTestPath = "s3n://eds-uga-csci8360/data/project2/labels/X_test_small.txt" classPath = "s3n://eds-uga-csci8360/data/project2/labels/y_train_small.txt" #bytePath = "/Users/priyanka/Desktop/project2files/all" #namePath = "/Users/priyanka/Desktop/X_train_small.txt" #nameTestPath="/Users/priyanka/Desktop/X_test_small.txt" #classPath = "/Users/priyanka/Desktop/y_train_small.txt" #docData Output: ('file:/Users/priyanka/Desktop/project2files/train/04mcPSei852tgIKUwTJr.bytes', '00401000 20 FF 58 C0 20 FE 5C 01 F8 00 0F 8B 50 FC 06 01\r\n00401010 8C 01 FF") docData = sc.wholeTextFiles( bytePath, 25).map(lambda (x, y): (x.encode("utf-8"), y.encode("utf-8"))) print("docData frankie") docData.take(1) #clean docData here - remove 1st word from line and remove /r/n cleanDocData = docData.map(lambda (x, y): (x, clean(y.split()))) #try calculating tf here (filename,tf) x = 16**2 + 1 hashingTF = HashingTF(x) tfDocData = cleanDocData.map(lambda (x, y): (x, hashingTF.transform(y))) tfDocData.take(1) #Output format : (index,filename) nameData = sc.textFile( namePath, 25).map(lambda x: "file:" + bytePath + "/" + x + ".bytes" ).zipWithIndex().map(lambda (x, y): (y, x)) #nameData.take(5) #Output format: (index,label) labelData = sc.textFile( classPath, 25).zipWithIndex().map(lambda (x, y): (y, str(int(x) - 1))) #Output format: (filename,label) joinNameLabel = nameData.join(labelData).map(lambda (x, y): y) #joinNameLabel.take(5) #Output: (label,tfidf) joinCleanDocLabel = joinNameLabel.join(tfDocData).map(lambda (x, y): y) #Output: (label,tfidf) hashData = joinCleanDocLabel.map(lambda (label, text): LabeledPoint(label, text)) print "hashing TF done" print("generating model fliss") model1 = RandomForest.trainClassifier(hashData, numClasses=9, categoricalFeaturesInfo={}, numTrees=50, featureSubsetStrategy="auto", impurity='gini', maxDepth=8, maxBins=32) #============================================================================== # Testing starts here #============================================================================== #Output: (filename,index) nameTestData = sc.textFile(nameTestPath, 25).map( lambda x: "file:" + bytePath + "/" + x + ".bytes").zipWithIndex() #Output: (index,tfidf) joinTestDocLabel = nameTestData.join(tfDocData).map(lambda (x, y): y) print("hashing test kenny") hashTestData = joinTestDocLabel.map( lambda (label, text): LabeledPoint(label, text)) hashTestData.persist() #Random forest prediction and labels and accuracy print "prediction part lyndz" prediction1 = model1.predict(hashTestData.map(lambda x: x.features)) prediction1.saveAsTextFile("/Users/priyanka/Desktop/pred.txt")
# Evaluate model on test instances and compute test error
predictions = modelDT.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testAccuracy = labelsAndPredictions.filter(
    lambda (v, p): v == p).count() / float(testData.count())
print('Decision Tree Test Accuracy =' + str(testAccuracy))

# Print the predictions
print("Actual vs Predicted values - Decision Tree")
print(labelsAndPredictions.collect())

# Train a RandomForest model
modelRF = RandomForest.trainClassifier(trainingData, numClasses=3, categoricalFeaturesInfo={},
                                       numTrees=3, featureSubsetStrategy="auto",
                                       impurity='gini', maxDepth=4, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = modelRF.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testAccuracy = labelsAndPredictions.filter(
    lambda (v, p): v == p).count() / float(testData.count())
print('Random Forest Test Accuracy =' + str(testAccuracy))

# Print the predictions
print("Actual vs Predicted values - Random Forest")
print(labelsAndPredictions.collect())
results = [] for train_index, test_index in ss: X_training, Y_training, X_test, Y_test = [], [], [], [] for i in train_index: X_training.append(X[i]) Y_training.append(Y[i]) for i in test_index: X_test.append(X[i]) Y_test.append(Y[i]) parsedData = [] for i in range(0, len(X_training)): parsedData.append(LabeledPoint(Y_training[i], X_training[i])) model = RandomForest.trainClassifier(sc.parallelize(parsedData), 2, {}, 3, seed=42) testErr = 0 for i in range(0, len(X_test)): a = Y_test[i] b = model.predict(X_test[i]) #b = 1 if a != b: testErr += 1 Err += float(testErr) / float(len(X_test)) print("AVG test error: %.6f" % (Err / iter_number))
#     categorical = range(0,30) + range(35,39) + range(41,46) + range(48,57)
#     data.cache()
#     mappings = [get_mapping(data, i) for i in categorical]
    labelpoints = data.map(lambda x: LabeledPoint(x[-1], x[:-1]))
    return labelpoints


data = label_points(data_raw)

training, testing = data.randomSplit([0.5, 0.5], 0)

model = RandomForest.trainClassifier(training, numClasses=7, categoricalFeaturesInfo={},
                                     numTrees=1000, featureSubsetStrategy="auto",
                                     impurity='gini', maxBins=32)

predictions = model.predict(testing.map(lambda x: x.features))
labelsAndPredictions = testing.map(lambda lp: lp.label).zip(predictions)
# fraction of test points whose prediction matches the label
accuracy = labelsAndPredictions.filter(lambda (v, p): v == p).count() / float(testing.count())

print accuracy

# https://books.google.com/books?id=syPHBgAAQBAJ&pg=PA166&lpg=PA166&dq=categorical+variables+labeledpoint+pyspark&source=bl&ots=X9VyTR348v&sig=cMf8rZlpbdWcyCl2jSPNU1Var6k&hl=en&sa=X&ved=0ahUKEwjPpofhh8XMAhVI1WMKHXoqCio4ChDoAQgbMAA#v=onepage&q=categorical%20variables%20labeledpoint%20pyspark&f=false
# Page 166
# def get_mapping(rdd, idx):
#     return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()
def train_random_forest(trainRDD, num_trees, max_depth):
    # binary classifier (numClasses=2) with no categorical features
    return RandomForest.trainClassifier(trainRDD, 2, {}, num_trees, maxDepth=max_depth)
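A minimal call sketch for `train_random_forest`, assuming a running `SparkContext` named `sc` and a toy labeled RDD (both are assumptions, not part of the snippet above):

```python
# Hypothetical usage; `sc` and the toy data are assumptions.
from pyspark.mllib.regression import LabeledPoint

train_rdd = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(0.0, [0.2, 0.9]),
    LabeledPoint(1.0, [1.0, 0.1]),
    LabeledPoint(1.0, [0.9, 0.0]),
])
model = train_random_forest(train_rdd, num_trees=10, max_depth=4)
print(model.predict([0.95, 0.05]))  # expected to be close to 1.0
```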
from pyspark.mllib.evaluation import MulticlassMetrics

trainDataPath = "s3://cloudpa2/TrainingDataset.csv"
valDataPath = "s3://cloudpa2/ValidationDataset.csv"

sc = SparkContext.getOrCreate()
sc.getConf().setAppName('cloudpa2')

rawData = sc.textFile(trainDataPath)
records = rawData.filter(lambda str: "\"" not in str).map(
    lambda str: str.split(";")).map(lambda strVal: [float(x) for x in strVal])
data = records.map(
    lambda arr: LabeledPoint(int(arr[-1]) - 1, Vectors.dense(arr[:-1])))

valRawData = sc.textFile(valDataPath)
valRecords = valRawData.filter(lambda str: "\"" not in str).map(
    lambda str: str.split(";")).map(lambda strVal: [float(x) for x in strVal])
valData = valRecords.map(lambda arr: (arr[:-1], int(arr[-1]) - 1))

rfModel = RandomForest.trainClassifier(data, numClasses=10, categoricalFeaturesInfo={},
                                       numTrees=100)

rfModelPredictionAndLabels = rfModel.predict(
    valData.map(lambda tp: tp[0])).zip(valData.map(lambda _: float(_[1])))
rfModelMetric = MulticlassMetrics(rfModelPredictionAndLabels)
# report F1 (the metric named in the original print) alongside plain accuracy
print("F1 Score : %s" % (rfModelMetric.weightedFMeasure()))
print("Accuracy : %s" % (rfModelMetric.accuracy))

rfModel.save(sc, "s3://myprogrambucket123/rfwine_model.model")
dataPath = 'train_svm'  # 'data/mllib/sample_libsvm_data.txt'
if len(sys.argv) == 2:
    dataPath = sys.argv[1]
if not os.path.isfile(dataPath):
    sc.stop()
    usage()
points = MLUtils.loadLibSVMFile(sc, dataPath)

# Re-index class labels if needed.
(reindexedData, origToNewLabels) = reindexClassLabels(points)
numClasses = len(origToNewLabels)

# Train a classifier.
categoricalFeaturesInfo = {}  # no categorical features
#model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
#                                     categoricalFeaturesInfo=categoricalFeaturesInfo)
model = RandomForest.trainClassifier(reindexedData, numClasses=numClasses,
                                     categoricalFeaturesInfo={}, numTrees=30,
                                     featureSubsetStrategy='auto', impurity='gini',
                                     maxDepth=8, maxBins=40)

# Print learned model and stats.
print origToNewLabels
print "Trained RandomForest for classification:"
# print "  Model numNodes: %d" % model.numNodes()
# print "  Model depth: %d" % model.depth()
print "  Training accuracy: %g" % getAccuracy(model, reindexedData)
# if model.numNodes() < 20:
#     print model.toDebugString()
# else:
#     print model
print model

# load the test set before predicting on it  #reuben
testdata = MLUtils.loadLibSVMFile(sc, 'test_svm')
predictions = model.predict(testdata.map(lambda x: x.features))
# labels = testdata.map(lambda l: l.label)
def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\ RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel data = [ LabeledPoint(0.0, [1, 0, 0]), LabeledPoint(1.0, [0, 1, 1]), LabeledPoint(0.0, [2, 0, 0]), LabeledPoint(1.0, [0, 2, 1]) ] rdd = self.sc.parallelize(data) features = [p.features.tolist() for p in data] temp_dir = tempfile.mkdtemp() lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10) self.assertTrue(lr_model.predict(features[0]) <= 0) self.assertTrue(lr_model.predict(features[1]) > 0) self.assertTrue(lr_model.predict(features[2]) <= 0) self.assertTrue(lr_model.predict(features[3]) > 0) svm_model = SVMWithSGD.train(rdd, iterations=10) self.assertTrue(svm_model.predict(features[0]) <= 0) self.assertTrue(svm_model.predict(features[1]) > 0) self.assertTrue(svm_model.predict(features[2]) <= 0) self.assertTrue(svm_model.predict(features[3]) > 0) nb_model = NaiveBayes.train(rdd) self.assertTrue(nb_model.predict(features[0]) <= 0) self.assertTrue(nb_model.predict(features[1]) > 0) self.assertTrue(nb_model.predict(features[2]) <= 0) self.assertTrue(nb_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories dt_model = DecisionTree.trainClassifier( rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) self.assertTrue(dt_model.predict(features[3]) > 0) dt_model_dir = os.path.join(temp_dir, "dt") dt_model.save(self.sc, dt_model_dir) same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir) self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString()) rf_model = RandomForest.trainClassifier( rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1) self.assertTrue(rf_model.predict(features[0]) <= 0) self.assertTrue(rf_model.predict(features[1]) > 0) self.assertTrue(rf_model.predict(features[2]) <= 0) self.assertTrue(rf_model.predict(features[3]) > 0) rf_model_dir = os.path.join(temp_dir, "rf") rf_model.save(self.sc, rf_model_dir) same_rf_model = RandomForestModel.load(self.sc, rf_model_dir) self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString()) gbt_model = GradientBoostedTrees.trainClassifier( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4) self.assertTrue(gbt_model.predict(features[0]) <= 0) self.assertTrue(gbt_model.predict(features[1]) > 0) self.assertTrue(gbt_model.predict(features[2]) <= 0) self.assertTrue(gbt_model.predict(features[3]) > 0) gbt_model_dir = os.path.join(temp_dir, "gbt") gbt_model.save(self.sc, gbt_model_dir) same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir) self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString()) try: rmtree(temp_dir) except OSError: pass
df = spark.read.format('com.databricks.spark.csv').csv( 's3://cs643-wine/TrainingDataset.csv', header=True, sep=";") # def parsePoint(line): # # values = [float(x) for x in line.split(';')] # return LabeledPoint(values[11], values[0:10]) parsedTData = df.rdd.map( lambda row: LabeledPoint(row[-1], Vectors.dense(row[:11]))) #Training data using Random Forest model = RandomForest.trainClassifier(parsedTData, numClasses=11, categoricalFeaturesInfo={}, numTrees=3, impurity='gini', maxDepth=4, maxBins=32) # vData = sc.textFile("ValidationDataset.csv") # header = vData.first() # rows = vData.filter(lambda x: x != header) vdf = spark.read.format('com.databricks.spark.csv').csv( 's3://cs643-wine/ValidationDataset.csv', header=True, sep=";") parsedVData = vdf.rdd.map( lambda row: LabeledPoint(row[-1], Vectors.dense(row[:11]))) predictions = model.predict(parsedVData.map(lambda x: x.features))
# Class 18.0 precision = 0.0 # Class 18.0 recall = 0.0 # Class 18.0 F1 Measure = 0.0 # Class 19.0 precision = 0.0 # Class 19.0 recall = 0.0 # Class 19.0 F1 Measure = 0.0 # Class 20.0 precision = 0.0 # Class 20.0 recall = 0.0 # Class 20.0 F1 Measure = 0.0 training = training_random.rdd.map(lambda row: LabeledPoint(row['label'], row['raw_Features'].toArray())) test = test_random.rdd.map(lambda row: LabeledPoint(row['label'], row['raw_Features'].toArray())) #======== RandomForest rf_model = RandomForest.trainClassifier(training,21,{}, 50, seed=1000) rf_model.totalNumNodes()#402 ## Compute raw scores on the test set # predictionAndLabels = test.map(lambda lp: (float(rf_model.predict(lp.features)), lp.label)) #doesnot work # an other way predictions = rf_model.predict(test.map(lambda x: x.features)) labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions) predictionAndLabels=labelsAndPredictions.map(lambda x: (x[1],x[0] )) #infact , there is no need to switch metrics = MulticlassMetrics(predictionAndLabels) #metrics = MulticlassMetrics(labelsAndPredictions) # the same as above #predictionAndLabels – an RDD of (prediction, label) pairs.
#Preparing the training data from pyspark.mllib.regression import LabeledPoint from numpy import array data_raw = train_data.rdd.map(lambda x: LabeledPoint(x[12], array((x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8] ,x[9], x[10], x[11]), dtype=float))) (trainingData, testData) = data_raw.randomSplit([0.7, 0.3]) # In[ ]: #Training the random forest model from pyspark.mllib.tree import RandomForest, RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={0:13, 1:13, 5:13, 11:14}, numTrees=4, featureSubsetStrategy="auto", impurity='gini', maxDepth=7, maxBins=15) # In[ ]: #Testing the trained model on the test data and evaluating error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification forest model:') print(model.toDebugString())
for country_from in country_list:
    for country_to in country_list:
        print("Country from: ", country_from, " Country to: ", country_to)
        try:
            df2 = df.filter(df.Country_from == country_from).filter(
                df.Country_to == country_to)
            df_temp = df2.select(df2.Scrap_time.cast("float"), 'Airline1_Back', 'Airline2_There', 'Airline2_Back',
                                 'Airline1_There', df2.Days.cast("float"), df2.Journey_time.cast("float"),
                                 df2.Full_Price.cast("float"))
            for nazwa in nazwy:
                indexer = StringIndexer(inputCol=nazwa, outputCol=nazwa + "Index")
                df_temp = indexer.fit(df_temp).transform(df_temp)
            df_temp = df_temp.select('Airline1_BackIndex', 'Airline2_ThereIndex', 'Airline2_BackIndex',
                                     'Airline1_ThereIndex', 'Scrap_time', 'Days', 'Journey_time', 'Full_Price')
            transformed = transData(df_temp)
            test = transformed.rdd.map(lambda row: LabeledPoint(
                row['label'], row['features'].toArray()))

            model = RandomForest.trainRegressor(test, categoricalFeaturesInfo={},
                                                numTrees=30, featureSubsetStrategy="auto",
                                                impurity='variance', maxDepth=4, maxBins=32)

            model.save(sc, "modele/" + country_from + "_" + country_to)
        except:
            print("Run it again for ", country_from, " ", country_to)
# MLUtils.saveAsLibSVMFile(data, "hdfs:///hndata/spam_docvecs") # Split the data into training and test sets (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a RandomForest model. # Empty categoricalFeaturesInfo indicates all features are continuous. # Note: Use larger numTrees in practice. # Setting featureSubsetStrategy="auto" lets the algorithm choose. rr = RandomForest.trainClassifier( trainingData, numClasses=2, categoricalFeaturesInfo={}, numTrees=3, featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, ) predictions = rr.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) posErr = ( float(labelsAndPredictions.filter(lambda (v, p): v == 0.0 and v != p).count()) / testData.filter(lambda lp: lp.label == 0.0).count() ) negErr = ( float(labelsAndPredictions.filter(lambda (v, p): v == 1.0 and v != p).count()) / testData.filter(lambda lp: lp.label == 1.0).count() )
}  # feature 1 has 53 categories, 0 .. to .. 52 (corresponding to week 1 .. 53)

# [(crimes, [beat, week, temp])]
# feature 0: beat
# feature 1: week
# feature 2: temp

# featuresDic = {}  # for all continuous predictors

maxBins = max(len(beatsDic), len(weekDic))  # DecisionTree requires maxBins >= max categories in categorical features (304)

### Fit
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo=featuresDic,
                                    numTrees=10, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=maxBins)

### Evaluate
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
# print(model.toDebugString())

### Compute R2
SSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) *
labeled_data = labeled_indices.leftOuterJoin(keyfirst_train).map( lambda _: (_[1][1], _[0])) unlabeled_data = unlabeled_indices.leftOuterJoin(keyfirst_train).map( lambda _: (_[1][1], _[0])) print('labeled = ', labeled_indices.count(), ' unlabeled = ', unlabeled_indices.count()) if unlabeled_indices.isEmpty(): break n_estimators = 10 model = RandomForest.trainClassifier(labeled_data.map(lambda _: _[0]), numClasses=2, categoricalFeaturesInfo={}, numTrees=n_estimators, featureSubsetStrategy="auto", impurity='gini') ''' accuracy test on testset here''' predictions = model.predict(test.map(lambda x: x.features)) labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda _: _[0] != _[1]) n_unlabeled = unlabeled_data.count() rdd = sc.parallelize([]) for tree in model._java_model.trees(): predX = DecisionTreeModel(tree).predict(unlabeled_data.map(lambda _ : _[0].features))\ .zipWithIndex()\ .map(lambda _: (_[1], _[0])) rdd = rdd.union(predX)
features_modeled_train, features_categorical_indexed_vec_train) ## select the one-hot-encoded categorical features along with numerical features as well as label to contrust the modeling dataset df_train_modeling = df_train.select(features_modeled_train) ## df_train_modeling_rdd for mllib package df_train_modeling_rdd = df_train_modeling.map( lambda p: convert_sparsevec_to_vec_df( p, features_categorical_indexed_vec_index_train)) df_train_modeling_rdd = df_train_modeling_rdd.map( lambda l: LabeledPoint(l[0], l[1:])) ################################################## 5: train random forest regression model ## random forest ## train model rfModel = RandomForest.trainRegressor(df_train_modeling_rdd, categoricalFeaturesInfo={}, numTrees=100, featureSubsetStrategy="auto", impurity='variance', maxDepth=10, maxBins=32) # Predict on train data predictions = rfModel.predict( df_train_modeling_rdd.map(lambda l: l.features)) ## Evaluation of the model predictionAndObservations = predictions.zip( df_train_modeling_rdd.map(lambda l: l.label)) testMetrics = RegressionMetrics(predictionAndObservations) model_time = str(model_time[0][0]) df_model_performance = sqlContext.createDataFrame( sc.parallelize( [[model_time, testMetrics.rootMeanSquaredError, testMetrics.r2]]), ["model_time", "RMSE", "R2"])
print("Number of training set rows: %d" % training_data.count()) print("Number of test set rows: %d" % test_data.count()) # COMMAND ---------- from pyspark.mllib.tree import RandomForest from time import * start_time = time() model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=25, featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, seed=13579) end_time = time() elapsed_time = end_time - start_time print("Time to train model: %.3f seconds" % elapsed_time) # COMMAND ---------- predictions = model.predict(test_data.map(lambda x: x.features)) labels_and_predictions = test_data.map(lambda x: x.label).zip(predictions) acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float( test_data.count())
l2.extend([float(a[j])]) test_features.append(l2) l = f.readline() f.close() # Random Forest # C14 - C21 # Build the Model rf_trees = [25, 50, 100, 150, 200, 250, 300] for i in range(0, len(rf_trees), 1): num_rf_trees = rf_trees[i] # Build the Model model = RandomForest.trainClassifier(train_data, 2, {}, num_rf_trees) rf_train_predict_label = [] rf_test_predict_label = [] # Predict Labels for j in range(0, len(test_features), 1): p_l = model.predict(test_features[j]) rf_test_predict_label.extend([p_l]) for j in range(0, len(train_features), 1): p_l = model.predict(train_features[j]) rf_train_predict_label.extend([p_l]) # Append Labels appendColumn(ensemble_test, rf_test_predict_label)
# from pyspark.mllib.tree import RandomForest, RandomForestModel # model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, # numTrees=3, featureSubsetStrategy="auto", # impurity='gini', maxDepth=4, maxBins=32) # evaluate(model,trainingData,testData) # In[16]: import pandas as pd from pyspark.mllib.tree import RandomForest, RandomForestModel for n in [1, 2, 4, 8, 16, 32, 64, 100]: start = time.time() model = RandomForest.trainClassifier(parsedData, numClasses=2, categoricalFeaturesInfo={}, numTrees=n, featureSubsetStrategy="auto", impurity='gini', maxDepth=30, maxBins=32) taken = time.time() - start taken pd.DataFrame([[filename, taken, n]]).to_csv('rf_spark.txt', mode='a', index=False, header=False) #from pyspark.mllib.classification import SVMWithSGD, SVMModel #model = SVMWithSGD.train(trainingData, iterations=100) #evaluate(model,trainingData,testData) # Gradient Boosted Trees
# ------------------------------------------------------------------------------
# Step 5(a):
# Parameters for the Random Forest model
# ------------------------------------------------------------------------------
RANDOM_SEED = 10904
RF_NUM_TREES = 100
RF_MAX_DEPTH = 4
RF_MAX_BINS = 100

# ------------------------------------------------------------------------------
# Step 5(b):
# Training a Random Forest model on the dataset
# ------------------------------------------------------------------------------
model = RandomForest.trainClassifier(transformed_train_df, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=RF_NUM_TREES, featureSubsetStrategy="log2", impurity="entropy",
                                     maxDepth=RF_MAX_DEPTH, maxBins=RF_MAX_BINS, seed=RANDOM_SEED)

# ------------------------------------------------------------------------------
# Step 5(c):
# Make predictions and compute accuracy
# ------------------------------------------------------------------------------
predictions = model.predict(transformed_test_df.map(lambda x: x.features))
labels_and_predictions = transformed_test_df.map(lambda x: x.label).zip(predictions)
model_accuracy = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(transformed_test_df.count())
print("Model accuracy: %.3f%%" % (model_accuracy * 100))

# ------------------------------------------------------------------------------
# Step 5(d):
sc = spark.sparkContext

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/diamonds.data')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=123)

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(trainingData, numClasses=9, categoricalFeaturesInfo={},
                                     numTrees=25, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=15, maxBins=32, seed=123)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())

result = testData.zip(predictions).collect()

# Print the predictions to output file
with open('machine_learning/results/predicted_cut.txt', 'w') as f:
    for i in result:
target_data = keyed_data.join(keyed_target) labled_point_data = target_data.map(lambda tup: LabeledPoint(tup[1][1][0], tup[1][0][0].split(','))) #map(lambda line: line.split(",")).map(lambda line: tuple((feature for feature in line))) # Split the data into training and test sets (30% held out for testing) print("Creating Training and Test Data Split") (trainingData, testData) = labled_point_data.randomSplit([0.7, 0.3]) # Train a RandomForest model. # Empty categoricalFeaturesInfo indicates all features are continuous. # Note: Use larger numTrees in practice. # Setting featureSubsetStrategy="auto" lets the algorithm choose. model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={}, numTrees=5, featureSubsetStrategy="auto", impurity='variance', maxDepth=8, maxBins=32) # # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count()) print('Test Mean Squared Error = ' + str(testMSE)) testAccuracy = labelsAndPredictions.map(lambda (v, p): 1 if (abs(v - p) < 10) else 0).sum() / float(testData.count()) print('Total Accuracy = ' + str(testAccuracy)) # print('Learned regression forest model:') # print(model.toDebugString()) # # Save and load model
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    try:
        os.removedirs(temp_dir)
    except OSError:
        pass
test_lp_arr = []
sample_data = all_data[train_indexes]
test_data = all_data[test_indexes]

for survived, record in sample_data:
    lp = LabeledPoint(survived, tuple(record))
    lparr.append(lp)

for survived, record in test_data:
    lp = LabeledPoint(survived, tuple(record))
    test_lp_arr.append(lp)

training_data = sc.parallelize(lparr).cache()
test_data_rdd = sc.parallelize(test_lp_arr).cache()

classificationModel = RandomForest.trainClassifier(training_data, numClasses=2,
                                                   categoricalFeaturesInfo={}, numTrees=3)
result = classificationModel.predict(test_data_rdd.map(lambda x: x.features))

print classificationModel
print classificationModel.toDebugString()
print "==============================="

predicted_data = result.collect()
actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()

print mean_absolute_error(actual_data, predicted_data)
print accuracy_score(actual_data, predicted_data)
print(classificationModel)
# for p in predicted_data:
#     print p
break
def train(self, trainingData):
    self.model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                              categoricalFeaturesInfo={}, numTrees=10,
                                              featureSubsetStrategy="auto", impurity='gini',
                                              maxDepth=4, maxBins=32)
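# A minimal usage sketch for the train() method above, assuming it lives in a
# small wrapper class. The class name RandomForestWrapper, the predict() helper,
# and the libsvm data path are illustrative assumptions, not part of the original.
from pyspark import SparkContext
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils


class RandomForestWrapper(object):
    def __init__(self):
        self.model = None

    def train(self, trainingData):
        # Same call as in the snippet above.
        self.model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                                  categoricalFeaturesInfo={}, numTrees=10,
                                                  featureSubsetStrategy="auto", impurity='gini',
                                                  maxDepth=4, maxBins=32)

    def predict(self, data):
        return self.model.predict(data.map(lambda x: x.features))


if __name__ == "__main__":
    sc = SparkContext(appName="RandomForestWrapperExample")
    data = MLUtils.loadLibSVMFile(sc, "data/sample_libsvm_data.txt")  # assumed path
    trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)
    wrapper = RandomForestWrapper()
    wrapper.train(trainingData)
    predictions = wrapper.predict(testData)
    sc.stop()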
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.evaluation import MulticlassMetrics
from prettytable import PrettyTable

sc = SparkContext()
spark = SparkSession(sc)

inputDF = spark.read.csv('TrainingDataset.csv', header='true', inferSchema='true', sep=';')
featureColumns = [c for c in inputDF.columns if c != 'quality']
transformed_df = inputDF.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

model = RandomForest.trainClassifier(transformed_df, numClasses=10, categoricalFeaturesInfo={},
                                     numTrees=50, maxBins=64, maxDepth=20, seed=33)
# model.save(sc, "s3://wineprediction/model_created.model")

validDF = spark.read.csv(
    '/testdata/*.csv', header='true', inferSchema='true', sep=';')
datadf = validDF.rdd.map(lambda row: LabeledPoint(
    row[-1], Vectors.dense(row[0:-1])))

predictions = model.predict(datadf.map(lambda x: x.features))
labels_and_predictions = datadf.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(
    lambda x: x[0] == x[1]).count() / float(datadf.count())
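# The snippet above imports MulticlassMetrics and PrettyTable but never uses them.
# The following is a hedged sketch (not from the original) of how those two
# imports could summarize the predictions; it reuses the predictions, datadf,
# and acc names defined above.
from pyspark.mllib.evaluation import MulticlassMetrics
from prettytable import PrettyTable

# MulticlassMetrics expects an RDD of (prediction, label) pairs.
prediction_and_labels = predictions.zip(datadf.map(lambda x: x.label))
metrics = MulticlassMetrics(prediction_and_labels)

summary = PrettyTable(["metric", "value"])
summary.add_row(["accuracy", acc])
summary.add_row(["weighted precision", metrics.weightedPrecision])
summary.add_row(["weighted recall", metrics.weightedRecall])
summary.add_row(["weighted F1", metrics.weightedFMeasure()])
print(summary)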
def main():
    options = parse_args()

    sc = SparkContext(appName="PythonRandomForestClassificationExample")
    pm.init(sc)

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, options.data_file)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=options.num_classes,
                                         categoricalFeaturesInfo={},
                                         numTrees=options.num_trees, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=options.max_depth, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    print("Using mlops to report statistics")

    # Adding multiple points (to see a graph in the ui)
    pm.set_stat("numTrees", options.num_trees, st.TIME_SERIES)
    pm.set_stat("numClasses", options.num_classes, st.TIME_SERIES)
    pm.set_stat("maxDepth", options.max_depth, st.TIME_SERIES)
    pm.set_stat("testError", testErr, st.TIME_SERIES)

    # TODO: this should be removed once we have better tests for mlops
    pm.set_stat("stat1", 1.0, st.TIME_SERIES)
    pm.set_stat("stat1", 2.0, st.TIME_SERIES)
    pm.set_stat("stat1", 3.0, st.TIME_SERIES)
    pm.set_stat("stat1", 4.0, st.TIME_SERIES)
    pm.set_stat("stat1", 5.0, st.TIME_SERIES)
    pm.set_stat("stat1", 6.0, st.TIME_SERIES)
    pm.set_stat("stat1", 7.0, st.TIME_SERIES)
    pm.set_stat("stat1", 8.0, st.TIME_SERIES)

    # String
    pm.set_stat("stat2", "str-value", st.TIME_SERIES)

    # Vec
    pm.set_stat("statvec", [4.5, 5.5, 6.6], st.TIME_SERIES)

    list_of_strings = []
    for x in range(1, 10000):
        list_of_strings.append("{},{},{}".format(x, x + 1, x + 2))

    rdd_of_str = sc.parallelize(list_of_strings)
    rdd = rdd_of_str.map(lambda line: Vectors.dense(line.split(",")))

    # Histograms and any input stats
    pm.set_stat("input", rdd, st.INPUT)

    print("Done reporting statistics")

    # Save and load model
    model.save(sc, options.output_model)
    print("Done saving model to {}".format(options.output_model))
    sameModel = RandomForestModel.load(sc, options.output_model)
    # $example off$
    sc.stop()
    pm.done()
    # categorical = range(0,30) + range(35,39) + range(41,46) + range(48,57)
    # data.cache()
    # mappings = [get_mapping(data, i) for i in categorical]
    labelpoints = data.map(lambda x: LabeledPoint(x[-1], x[:-1]))
    return labelpoints

data = label_points(data_raw)
training, testing = data.randomSplit([0.5, 0.5], 0)

model = RandomForest.trainClassifier(training, numClasses=7, categoricalFeaturesInfo={},
                                     numTrees=1000, featureSubsetStrategy="auto",
                                     impurity='gini', maxBins=32)

predictions = model.predict(testing.map(lambda x: x.features))
labelsAndPredictions = testing.map(lambda lp: lp.label).zip(predictions)
# Filtering on v != p gives the misclassification rate (test error), not the accuracy.
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testing.count())
print testErr

# https://books.google.com/books?id=syPHBgAAQBAJ&pg=PA166&lpg=PA166&dq=categorical+variables+labeledpoint+pyspark&source=bl&ots=X9VyTR348v&sig=cMf8rZlpbdWcyCl2jSPNU1Var6k&hl=en&sa=X&ved=0ahUKEwjPpofhh8XMAhVI1WMKHXoqCio4ChDoAQgbMAA#v=onepage&q=categorical%20variables%20labeledpoint%20pyspark&f=false
# Page 166
# def get_mapping(rdd, idx):
#     return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()
#
# cat_len = sum(map(len, mappings))
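# Hedged sketch (not part of the original): how the commented-out get_mapping
# helper above could feed categoricalFeaturesInfo, which maps a feature index to
# its number of categories. The indices in categorical_indexes and the raw_rows
# RDD of already-split field lists are assumptions for illustration; MLlib also
# requires that the categorical values in the LabeledPoints be encoded as the
# 0..k-1 indices produced by get_mapping and that maxBins cover the largest
# category count.
def get_mapping(rdd, idx):
    # distinct value -> 0-based index for one column
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

categorical_indexes = [0, 1, 2]  # assumed positions of categorical columns
categoricalFeaturesInfo = dict(
    (idx, len(get_mapping(raw_rows, idx))) for idx in categorical_indexes)

model = RandomForest.trainClassifier(training, numClasses=7,
                                     categoricalFeaturesInfo=categoricalFeaturesInfo,
                                     numTrees=1000, featureSubsetStrategy="auto",
                                     impurity='gini', maxBins=32)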
testFinal.count()
testFinal.collect()

# For getting the threshold limit, using the train dataset
(training1, training2) = trainFinal.randomSplit([0.7, 0.3])
training1.collect()

model_1 = RandomForest.trainRegressor(training1, categoricalFeaturesInfo={},
                                      numTrees=3, featureSubsetStrategy="auto",
                                      impurity='variance', maxDepth=4, maxBins=32)
model_2 = GradientBoostedTrees.trainRegressor(training1, categoricalFeaturesInfo={}, numIterations=3)
model_3 = DecisionTree.trainRegressor(training1, categoricalFeaturesInfo={},
                                      impurity='variance', maxDepth=5, maxBins=32)

predictionsRFTrain = model_1.predict(training1.map(lambda x: x.features))
predictionsGBTTrain = model_2.predict(training1.map(lambda x: x.features))
predictionsDTTrain = model_3.predict(training1.map(lambda x: x.features))

predictionsRFTrain.collect()
predictionsGBTTrain.collect()
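# Hedged follow-up (not in the original): one way to compare the three
# regressors trained above is to score each on the held-out training2 split and
# report the mean squared error. The helper name mse_of is an assumption; it
# reuses model_1, model_2, model_3, and training2 from the snippet above.
def mse_of(model, data):
    preds = model.predict(data.map(lambda x: x.features))
    labels_and_preds = data.map(lambda x: x.label).zip(preds)
    return labels_and_preds.map(lambda vp: (vp[0] - vp[1]) ** 2).mean()

for name, m in [("RandomForest", model_1), ("GBT", model_2), ("DecisionTree", model_3)]:
    print(name, mse_of(m, training2))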
numFeatures = parsed_data.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
labeled_data = parsed_data.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))

unbalance_test = data_ans_0827.map(feature_char_to_num).cache()
l_unbal_te = unbalance_test.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))

# Split data into trainData and testData
(trainData, testData) = labeled_data.randomSplit([0.9, 0.1])

len_list = [len(i) for i in fe]
col_na_l = [i - 1 for i in col_na]               # because the slice [1:-2] drops the first value in the vector
col_na_l = [i - 1 for i in col_na_l if i >= 83]  # to drop out the 85th column
features_dict = dict(zip(col_na_l, len_list))    # feature dict, e.g. {1: 3, 5: 8}

model = RandomForest.trainClassifier(trainData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=50, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(l_unbal_te.map(lambda x: x.features))
labelsAndPredictions = l_unbal_te.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(l_unbal_te.count())
print 'Test Error = ' + str(testErr)

# Recall: fraction of actual positives that were predicted positive
true_positive = labelsAndPredictions.filter(lambda (v, p): v == p and p == 1).count() \
    / float(labelsAndPredictions.filter(lambda (v, p): v == 1).count())
print "recall = TP/(TP+FN)", true_positive

# Precision: fraction of predicted positives that are actual positives
f_true = labelsAndPredictions.filter(lambda (v, p): v == p and v == 1).count() \
    / float(labelsAndPredictions.filter(lambda (v, p): p == 1).count())
print "precision = TP/(TP+FP)", f_true

print "positive/negative ratio", labelsAndPredictions.filter(lambda (v, p): v == 1).count() \
    / float(labelsAndPredictions.filter(lambda (v, p): v == 0).count())

# print "False", labeled_data.filter(lambda p: p.label == 0).count()
# print "Positive", labeled_data.filter(lambda p: p.label == 1).count()
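# Hedged addition (not in the original): combining the recall (true_positive)
# and precision (f_true) computed above into an F1 score.
f1 = 2 * true_positive * f_true / (true_positive + f_true) if (true_positive + f_true) > 0 else 0.0
print "F1 =", f1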
# Tokenize the text column
split = Tokenizer(inputCol="text", outputCol="words")
wordsData = split.transform(train_hive_info)
my_print('Tokenization finished.......')

# Add the TF feature column
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2**10)
TF_data = hashingTF.transform(wordsData)
my_print('TF features built.......')

# Add the IDF feature column
idf = IDF(inputCol="rawFeatures", outputCol="features").fit(TF_data)
final_input_data = idf.transform(TF_data)
my_print('IDF features built.......')

train_rdd = final_input_data.select("label", "features") \
    .rdd.map(lambda row: LabeledPoint(row[0], row[1].toArray()))

if model_name == 'LogisticRegression':
    model = LogisticRegressionWithLBFGS.train(train_rdd, numClasses=10)
    model.save(sc, model_path)
elif model_name == 'NaiveBayes':
    model = NaiveBayes.train(train_rdd)
    model.save(sc, model_path)
else:
    model = RandomForest.trainClassifier(train_rdd, 10, {}, 10, seed=42)
    model.save(sc, model_path)

my_print('Model training finished.......')
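# Hedged sketch (not in the original): scoring new text with the saved model.
# It reuses the split / hashingTF / idf transformers fitted above, assumes the
# RandomForest branch was the one taken, and assumes new_text_df is a DataFrame
# with a "text" column; those names are illustrative.
from pyspark.mllib.tree import RandomForestModel

loaded_model = RandomForestModel.load(sc, model_path)
new_words = split.transform(new_text_df)
new_tf = hashingTF.transform(new_words)
new_features = idf.transform(new_tf)
pred_rdd = loaded_model.predict(
    new_features.select("features").rdd.map(lambda row: row[0].toArray()))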