def test_regression(self):
    """Check the sign of every regressor's predictions on a four-point dataset.

    The training points carry label -1.0 when their second feature is
    negative and 1.0 when it is positive.  Each trained model must predict a
    non-positive value for the former and a strictly positive value for the
    latter.  Afterwards the SGD trainers are called with explicit initial
    weights; a ``ValueError`` from any of them fails the test.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(label, [0, second])
        for label, second in ((-1.0, -1), (1.0, 1), (-1.0, -2), (1.0, 2))
    ]
    rdd = self.sc.parallelize(data)
    features = [point.features.tolist() for point in data]

    def check_signs(model):
        # Even positions hold the negative examples, odd positions the positive ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    sgd_trainers = (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    for trainer in sgd_trainers:
        check_signs(trainer.train(rdd, iterations=10))

    categorical_info = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, maxBins=4))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numTrees=10, maxBins=4, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numIterations=4))

    # Explicit initial weights must be accepted by every SGD trainer.
    try:
        for trainer in sgd_trainers:
            trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
def iterateLasso(iterNums, stepSizes, regParam, train, valid):
    """Print training and validation RMSE of Lasso models over a parameter grid.

    For every combination of iteration count and step size a model is fitted
    on ``train`` (with an intercept) and one line is printed:
    ``<iterations>, <step> -> <train RMSE>, <validation RMSE>``.

    Args:
        iterNums: Iteration counts to try (outer loop).
        stepSizes: Step sizes to try; re-iterated for each iteration count.
        regParam: Regularization parameter forwarded to the trainer.
        train: RDD whose elements expose ``features`` and ``label``; used for
            fitting and for the training error.
        valid: RDD of the same element shape, used only for evaluation.
    """
    from pyspark.mllib.regression import LassoWithSGD

    def root_mean_squared_error(points, fitted):
        # Pair each prediction with its label, then take sqrt(mean(squared error)).
        pairs = points.map(lambda x: (fitted.predict(x.features), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0] - p[1], 2)).mean())

    for numIter in iterNums:
        for step in stepSizes:
            model = LassoWithSGD().train(
                train, intercept=True, iterations=numIter, step=step, regParam=regParam)
            train_error = root_mean_squared_error(train, model)
            valid_error = root_mean_squared_error(valid, model)
            print("%d, %5.3f -> %.4f, %.4f" % (numIter, step, train_error, valid_error))
def test_regression(self):
    """Check the sign of the SGD regressors' predictions on a four-point dataset.

    Points with a negative second feature carry label -1.0, the others 1.0.
    Each model trained with default settings must predict a non-positive
    value for the former and a strictly positive value for the latter.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )

    data = [
        LabeledPoint(label, [0, second])
        for label, second in ((-1.0, -1), (1.0, 1), (-1.0, -2), (1.0, 2))
    ]
    rdd = self.sc.parallelize(data)
    features = [point.features.tolist() for point in data]

    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        model = trainer.train(rdd)
        # Even positions hold the negative examples, odd positions the positive ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)
def test_regression(self):
    """Check the sign of the SGD regressors' predictions on ``scipy_matrix`` features.

    Every feature vector is built with ``self.scipy_matrix(2, {1: value})``;
    points with a negative value carry label -1.0, the others 1.0.  Each
    model must predict a non-positive value for the former and a strictly
    positive value for the latter.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )

    data = [
        LabeledPoint(label, self.scipy_matrix(2, {1: value}))
        for label, value in ((-1.0, -1.0), (1.0, 1.0), (-1.0, -2.0), (1.0, 2.0))
    ]
    rdd = self.sc.parallelize(data)
    features = [point.features for point in data]

    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        model = trainer.train(rdd)
        # Even positions hold the negative examples, odd positions the positive ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)
def test_regression(self):
    """Check the sign of every regressor's predictions on a four-point dataset.

    Points with a negative second feature carry label -1.0, the others 1.0.
    Linear, Lasso and Ridge SGD models as well as decision-tree,
    random-forest and gradient-boosted-tree regressors must all predict a
    non-positive value for the former and a strictly positive value for the
    latter.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(label, [0, second])
        for label, second in ((-1.0, -1), (1.0, 1), (-1.0, -2), (1.0, 2))
    ]
    rdd = self.sc.parallelize(data)
    features = [point.features.tolist() for point in data]

    def check_signs(model):
        # Even positions hold the negative examples, odd positions the positive ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        check_signs(trainer.train(rdd))

    categorical_info = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numTrees=100, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info))
def iterateLasso(iterNums, stepSizes, regParam, train, valid):
    """Print training and validation RMSE of Lasso models over a parameter grid.

    For every combination of iteration count and step size a model is fitted
    on ``train`` (with an intercept) and one line is printed:
    ``<iterations>, <step> -> <train RMSE>, <validation RMSE>``.

    Args:
        iterNums: Iteration counts to try (outer loop).
        stepSizes: Step sizes to try; re-iterated for each iteration count.
        regParam: Regularization parameter forwarded to the trainer.
        train: RDD whose elements expose ``features`` and ``label``; used for
            fitting and for the training error.
        valid: RDD of the same element shape, used only for evaluation.
    """
    from pyspark.mllib.regression import LassoWithSGD

    def root_mean_squared_error(points, fitted):
        # Pair each prediction with its label, then take sqrt(mean(squared error)).
        pairs = points.map(lambda x: (fitted.predict(x.features), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0] - p[1], 2)).mean())

    for numIter in iterNums:
        for step in stepSizes:
            model = LassoWithSGD().train(
                train, intercept=True, iterations=numIter, step=step, regParam=regParam)
            train_error = root_mean_squared_error(train, model)
            valid_error = root_mean_squared_error(valid, model)
            print("%d, %5.3f -> %.4f, %.4f" % (numIter, step, train_error, valid_error))
def LassoModel(dataPath, label, normalize, character, master, ispca):
    """Train a Lasso model on a delimited text file and report its mean absolute error.

    Args:
        dataPath: Path passed to ``SparkContext.textFile``.
        label: When 0, every parsed row is reversed before further processing
            (presumably to move a trailing label to the front -- confirm
            against the data layout).
        normalize: When 1, the collected rows are passed through ``norm`` and
            each result is used as a ``(label, features)`` pair.
        character: Separator used to split each line into numeric fields.
        master: Spark master URL used to create the ``SparkContext``.
        ispca: When 1, the features are projected onto two PCA components
            before training.

    Returns:
        A report string containing the model weights and the mean absolute
        error over the rows the model was trained on.
    """
    pca_n = 2
    sc = SparkContext(master)
    try:
        data = sc.textFile(dataPath)
        # A list (not a lazy ``map`` object) is required: rows are sliced and
        # reversed below, which an iterator does not support on Python 3.
        ndata = data.map(lambda line: [float(x) for x in line.split(character)])
        if label == 0:
            ndata = ndata.map(lambda line: line[::-1])
        if normalize == 1:
            test_data = norm(ndata.collect())
            norm_data = sc.parallelize(test_data)
            train_data = norm_data.map(lambda part: lbp(part[0], part[1]))
        else:
            # NOTE(review): the slice 1:len(part) - 1 skips the last column as
            # well as the first -- confirm that is intended.
            test_data = ndata.map(lambda part: (part[0], part[1:len(part) - 1])).collect()
            train_data = ndata.map(lambda part: lbp(part[0], part[1:len(part) - 1]))
        if ispca == 1:
            pca = PCA(n_components=pca_n)
            pca_train = [row[1] for row in test_data]
            pca_data = pca.fit(pca_train).transform(pca_train)
            test = [[row[0], reduced] for row, reduced in zip(test_data, pca_data)]
            train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
            test_data = test
        model_larg = larg.train(train_data)
        size = train_data.count()
        err_larg = sum(
            (abs(model_larg.predict(test_data[i][1]) - test_data[i][0]) for i in range(size)),
            0.0)
        mean_error = err_larg / size
        # %-formatting prints the same text under both Python 2 and Python 3.
        print("result: %s" % mean_error)
        report = "Lasso Regression Result:\n"
        report = report + str(model_larg.weights) + '\n'
        report = report + "Error: " + str(mean_error)
    finally:
        # Always release the Spark context, even when training or parsing fails.
        sc.stop()
    return report
def test_regression(self):
    """Check the sign of every regressor's predictions on a four-point dataset.

    Points with a negative second feature carry label -1.0, the others 1.0.
    Linear, Lasso and Ridge SGD models as well as decision-tree,
    random-forest and gradient-boosted-tree regressors must all predict a
    non-positive value for the former and a strictly positive value for the
    latter.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2]),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Samples 0 and 2 are the negative examples, 1 and 3 the positive ones.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    # A fixed seed keeps the randomized forest reproducible, so the sign
    # assertions cannot fail intermittently.
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def linearRegression_f(mode):
    """Train a linear model on the module-level ``parsedData`` and return its training MSE.

    Args:
        mode: ``"no_reg"`` (plain linear regression), ``"L1_reg"`` (Lasso) or
            ``"L2_reg"`` (Ridge).

    Returns:
        The mean squared error of the trained model over ``parsedData``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == "no_reg":
        model = LinearRegressionWithSGD.train(parsedData)
    elif mode == "L1_reg":
        model = LassoWithSGD.train(parsedData)
    elif mode == "L2_reg":
        model = RidgeRegressionWithSGD.train(parsedData)
    else:
        print("ERROR Mode")
        # Without a model there is nothing to evaluate; fail explicitly
        # instead of hitting an unbound ``model`` further down.
        raise ValueError("unsupported mode: %r" % (mode,))

    # Evaluate the model on the training data as (label, prediction) pairs.
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    # Index-based access replaces tuple-parameter unpacking, which Python 3 rejects.
    squared_error_sum = valuesAndPreds.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
    MSE = squared_error_sum / valuesAndPreds.count()
    return MSE
def test_regression(self):
    """Check the sign of regressor predictions on ``scipy_matrix`` features.

    Every feature vector is built with ``self.scipy_matrix(2, {1: value})``;
    points with a negative value carry label -1.0, the others 1.0.  The SGD
    models and the decision-tree regressor must predict a non-positive value
    for the former and a strictly positive value for the latter.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree

    data = [
        LabeledPoint(label, self.scipy_matrix(2, {1: value}))
        for label, value in ((-1.0, -1.0), (1.0, 1.0), (-1.0, -2.0), (1.0, 2.0))
    ]
    rdd = self.sc.parallelize(data)
    features = [point.features for point in data]

    def check_signs(model):
        # Even positions hold the negative examples, odd positions the positive ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        check_signs(trainer.train(rdd))

    categorical_info = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info))
def exercise_2(self):
    """Fit linear, ridge and lasso SGD regressors on the Carat data and print their test MSE.

    The semicolon-separated file ``./carat-context-factors-percom.csv`` is
    parsed into 14-column tuples, rows with out-of-range sensor values are
    dropped, and the energy rate is regressed on CPU usage, screen
    brightness, Wi-Fi signal strength and battery temperature.  The data is
    split 4:1 into training and test sets; the three models and the tuple of
    their test mean squared errors are printed.

    Returns:
        None.
    """
    sc = self.spark_context
    lines = sc.textFile("./carat-context-factors-percom.csv")

    # Column positions of the 14 fields in each record.
    (energyRate, batteryHealth, batteryTemperature, batteryVoltage,
     cpuUsage, distanceTraveled, mobileDataActivity, mobileDataStatus,
     mobileNetworkType, networkType, roamingEnabled, screenBrightness,
     wifiLinkSpeed, wifiSignalStrength) = range(14)

    data = lines.map(lambda line: line.split(";")).map(lambda line: (
        float(line[energyRate]), line[batteryHealth],
        float(line[batteryTemperature]), float(line[batteryVoltage]),
        float(line[cpuUsage]), float(line[distanceTraveled]),
        line[mobileDataActivity], line[mobileDataStatus],
        line[mobileNetworkType], line[networkType],
        float(line[roamingEnabled]), float(line[screenBrightness]),
        float(line[wifiLinkSpeed]), float(line[wifiSignalStrength])))

    # Keep only rows whose numeric fields lie in the accepted ranges.
    data = data.filter(lambda x: (
        (x[screenBrightness] == -1 or 0 <= x[screenBrightness] <= 255)
        and 0 <= x[cpuUsage] <= 1
        and x[distanceTraveled] >= 0
        and -100 < x[wifiSignalStrength] < 0
        and x[batteryTemperature] >= 0))

    data = data.map(lambda x: LabeledPoint(
        x[energyRate],
        [x[cpuUsage], x[screenBrightness], x[wifiSignalStrength], x[batteryTemperature]]))
    train, test = data.randomSplit([4, 1])

    # The trailing comments are sample outputs recorded in the original source.
    lr = LinearRegressionWithSGD.train(train, iterations=100, step=1e-4, intercept=False)
    print(lr)  # (weights=[4.05918718288e-07,2.01710179227e-05,-3.39410603521e-05,1.70383825251e-05], intercept=0.0)
    rr = RidgeRegressionWithSGD.train(train, iterations=100, step=1e-4, intercept=False)
    print(rr)  # (weights=[4.05918453228e-07,2.0170994023e-05,-3.39410381473e-05,1.70383716836e-05], intercept=0.0)
    l = LassoWithSGD.train(train, iterations=100, step=1e-4, intercept=False)
    print(l)  # (weights=[0.0,1.96629057526e-05,-3.29054093642e-05,1.56445907401e-05], intercept=0.0)

    # Each element is (label, linear prediction, ridge prediction, lasso prediction).
    valuesAndPreds = test.map(lambda p: (
        p.label, lr.predict(p.features), rr.predict(p.features), l.predict(p.features)))
    count = valuesAndPreds.count()
    # Index-based access replaces tuple-parameter unpacking, which Python 3 rejects.
    MSE = valuesAndPreds.map(lambda t: (
        (t[0] - t[1]) ** 2 / count,
        (t[0] - t[2]) ** 2 / count,
        (t[0] - t[3]) ** 2 / count,
    )).reduce(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
    print(MSE)  # (4.7634385303075644e-05, 4.7634387065855108e-05, 4.7873793406702168e-05)
    return None
def test_regression(self):
    """Check the sign of regressor predictions on ``scipy_matrix`` features.

    Every feature vector is built with ``self.scipy_matrix(2, {1: value})``;
    points with a negative value carry label -1.0, the others 1.0.  The SGD
    models and the decision-tree regressor must predict a non-positive value
    for the former and a strictly positive value for the latter.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree

    data = [
        LabeledPoint(label, self.scipy_matrix(2, {1: value}))
        for label, value in ((-1.0, -1.0), (1.0, 1.0), (-1.0, -2.0), (1.0, 2.0))
    ]
    rdd = self.sc.parallelize(data)
    features = [point.features for point in data]

    def check_signs(model):
        # Even positions hold the negative examples, odd positions the positive ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        check_signs(trainer.train(rdd))

    categorical_info = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categorical_info))
def performLasso(training):
    """Fit a Lasso model with SGD on ``training`` (100 iterations, step 0.001) and return it."""
    return LassoWithSGD.train(training, step=0.001, iterations=100)
# Transform the Data: replace the first element of every pair via mappingDates.
TestRDD = TestRDD.map(lambda x: (mappingDates(x[0], authorDate), x[1]))
TrainingRDD = TrainingRDD.map(lambda x: (mappingDates(x[0], authorDate), x[1]))

# Create Hashed Vectors
TestRDD = TestRDD.map(lambda x: (hashVector(x[0], x[1], 10000)))
TrainingRDD = TrainingRDD.map(lambda x: (hashVector(x[0], x[1], 10000)))

# Create Labelled Points of Each of the Vectors
TrainingRDD = TrainingRDD.map(lambda f_x: LabeledPoint(f_x[0], f_x[1]))

# Train Model on The Training Set
model = LassoWithSGD.train(TrainingRDD)

# Test the Model on the Test Set: predict locally on the collected vectors.
predictions = []
TestRDD_Array = TestRDD.values().collect()
for test_vector in TestRDD_Array:
    Prediction_Label = model.predict(np.array(test_vector))
    predictions.append(Prediction_Label)

TestRDD_Array_Label = TestRDD.keys().collect()
for actual_label, predicted_label in zip(TestRDD_Array_Label, predictions):
    # %-formatting prints "<label> <prediction>" identically on Python 2 and 3.
    print("%s %s" % (actual_label, predicted_label))

# Stop Watch: modelTime held the start timestamp and now holds the elapsed time.
modelTime = time() - modelTime

print('\n############ Processing Completed ##############')
print('################################################\n')
from pyspark.mllib.regression import LabeledPoint, LassoWithSGD
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS


# Load and parse the data
def parsePoint(line):
    """Parse one semicolon-separated line of numbers into a ``LabeledPoint``.

    Column 11 is used as the label and columns 0-9 as the features.
    """
    values = [float(x) for x in line.split(';')]
    # NOTE(review): values[0:10] leaves out column 10 although the label is
    # column 11 -- confirm the column is skipped on purpose.
    return LabeledPoint(values[11], values[0:10])


sc = SparkContext("local", "Simple App")
data = sc.textFile("../winequality.csv")
parsedData = data.map(parsePoint)

# Build the model
model = LassoWithSGD.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
# Index-based access replaces tuple-parameter unpacking, which Python 3 rejects.
# NOTE(review): the error counts predictions that differ exactly from the
# label; for a regression model that is almost every row -- confirm this
# metric is intended.
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
# Load and parse the data
def parsePoint(line):
    """Parse one comma- or space-separated line of numbers into a ``LabeledPoint``.

    Column 6 is used as the label and columns 0-5 as the features.
    """
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin yields exactly the same values.
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[6], values[0:6])


data = sc.textFile("/user/cloudera/hw1/train_nohead.csv")
wholedata = sc.textFile("/user/cloudera/hw1/wholedata.csv")
parsedData = data.map(parsePoint)
parsedWholeData = wholedata.map(parsePoint)

# Build the model
model = LassoWithSGD.train(parsedData, iterations=100, step=0.1, regParam=0.01)

# Evaluate the model on the whole dataset as (label, prediction) pairs.
valuesAndPreds = parsedWholeData.map(lambda p: (p.label, model.predict(p.features)))
# Index-based access replaces tuple-parameter unpacking, which Python 3 rejects.
RMSE = np.sqrt(
    valuesAndPreds
    .map(lambda vp: (vp[0] - vp[1]) ** 2)
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
)
print("lasso output : \n")
print("RMSE = {0}\n".format(RMSE))

# Save the model
model.save(sc, "/user/cloudera/hw1/results/2015310884_lasso")
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LassoWithSGD, LassoModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")

# Load the evaluation sets and the training set (LibSVM format).
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_360.txt')

model = LassoWithSGD.train(traindata)

# For each dataset: predict, pair labels with predictions, print the MSE and
# save the pairs.  Index-based lambdas replace tuple-parameter unpacking,
# which Python 3 rejects.
predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_Lasso")

predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(predictions_720)
MSE_720 = labelsandpredictions_720.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/hf_720_Lasso")

predictions_540 = model.predict(data_540.map(lambda x: x.features))
labelsandpredictions_540 = data_540.map(lambda lp: lp.label).zip(predictions_540)
MSE_540 = labelsandpredictions_540.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data_540.count())
print("training MSE_540 = " + str(MSE_540))
labelsandpredictions_540.saveAsTextFile("/usr/hadoop/hf_540_Lasso")

predictions_360 = model.predict(data_360.map(lambda x: x.features))
# Peek at the first labeled points.
autoDataLabelPoint.take(5)

# Step 9-7-3. Dividing training and testing data (70% / 30% weights).
autoDataLabelPointSplit = autoDataLabelPoint.randomSplit([0.7, 0.3])
autoDataLabelPointTrain, autoDataLabelPointTest = (
    autoDataLabelPointSplit[0],
    autoDataLabelPointSplit[1],
)
autoDataLabelPointTrain.take(5)
autoDataLabelPointTest.take(5)
autoDataLabelPointTest.count()
autoDataLabelPointTrain.count()

# Step 9-8-1. Creating a linear regression model with Lasso.
from pyspark.mllib.regression import LassoWithSGD as lassoSGD
ourModelWithLasso = lassoSGD.train(
    data=autoDataLabelPointTrain,
    intercept=True,
    regParam=0.05,
    step=0.0005,
    iterations=400,
)
ourModelWithLasso.intercept
ourModelWithLasso.weights

# Step 9-8-2. Predicting the data using lasso model.
# Each element becomes an (actual label, predicted value) pair of floats.
actualDataandLassoPredictedData = autoDataLabelPointTest.map(
    lambda point: (float(point.label), float(ourModelWithLasso.predict(point.features))))
actualDataandLassoPredictedData.take(5)

# Step 9-8-3. Evaluating the model we have created.
from pyspark.mllib.evaluation import RegressionMetrics as rmtrcs
ourLassoModelMetrics = rmtrcs(actualDataandLassoPredictedData)
ourLassoModelMetrics.rootMeanSquaredError
def modelSelection(argv):
    """Pick, train and evaluate an MLlib model according to command-line style arguments.

    Args:
        argv: Argument list whose first element is ignored (script name),
            followed by: dataset path (``.csv`` or ``.json``), learning kind
            (``"supervised"`` / ``"unsupervised"``), task
            (``"classification"`` or ``"classifier"``, ``"regression"``,
            ``"clustering"``), and the remaining parameters, which are passed
            to the dataset loaders and the model-choosing helpers.

    Returns:
        The trained model, or ``None`` when the arguments are insufficient or
        unsupported (a message is printed in most of those cases).

    Side effects:
        Writes ``results.txt`` (and ``datapoints.txt`` for unsupervised runs)
        in the current working directory.
    """
    if len(argv) < 5:
        print("The arguments for this script require:\n" +
              "(hdfs or file):///path/to/filename of the dataset\n" +
              "supervised/unsupervised\n" +
              "classifier/regression/clustering\n" +
              "parameter trying to be guessed\n" +
              "other parameters\n")
        return None

    args = argv[1:]
    path = args[0]
    params = args[3:]

    # Validate the file type before touching Spark so that unsupported input
    # is rejected without creating an RDD.
    if path.endswith("csv"):
        loader = csvFilterAndMap
    elif path.endswith("json"):
        loader = jsonFilterAndMap
    else:
        print("This program only supports .csv and .json files")
        return None

    # Sets up the RDD.
    dataset = loader(sc.textFile(path), params)

    # Model selection algorithm. Currently goes off of scikit learn's cheat sheet.
    if args[1] == "supervised":
        # Column 0 is the label, the remaining columns are the features.  The
        # row is used directly: slicing a zipped (label, values) pair with
        # [1:] would wrap the feature sequence in a 1-tuple.
        zipped_data = dataset.map(lambda row: LabeledPoint(row[0], row[1:])).cache()
        datasetTraining, datasetTest = zipped_data.randomSplit([.8, .2])

        # The usage text advertises "classifier"; accept both spellings.
        if args[2] in ("classification", "classifier"):
            theModel = NaiveBayes.train(datasetTraining)
            test_preds = datasetTest.map(lambda x: x.label).zip(
                theModel.predict(datasetTest.map(lambda x: x.features)))
            # NOTE(review): pairs are (label, prediction); MulticlassMetrics
            # documents (prediction, label), so the confusion matrix may be
            # transposed -- confirm.
            test_metrics = MulticlassMetrics(test_preds.map(lambda x: (x[0], float(x[1]))))
            testing_accuracy = test_metrics.precision()
            with open('results.txt', 'w+') as f:
                f.write("accuracy: " + str(testing_accuracy) + "\n")
                f.write("confusion matrix:\n" + str(test_metrics.confusionMatrix().toArray()))
            return theModel

        if args[2] == "regression":
            # The regression flavour is chosen on a 30% sample of the data.
            sample = zipped_data.sample(False, .3)
            model = performRegression(sample, params)
            if model == "lasso":
                trainer = LassoWithSGD
            elif model == "linear":
                trainer = LinearRegressionWithSGD
            else:
                trainer = RidgeRegressionWithSGD
            theModel = trainer.train(datasetTraining, iterations=1000, step=0.001)
            test = datasetTest.map(lambda x: x.label).zip(
                theModel.predict(datasetTest.map(lambda x: x.features)))
            metrics = RegressionMetrics(test.map(lambda x: (x[0], float(x[1]))))
            value = metrics.rootMeanSquaredError
            with open('results.txt', 'w+') as f:
                f.write(model + " root mean squared error: ")
                f.write(str(value))
            return theModel

        print("Please use rather classification or regression for supervised learning")
        return None

    if args[1] == "unsupervised":
        sample = dataset.sample(False, .3)
        with open('datapoints.txt', 'w+') as f:
            f.write("dataset: " + str(dataset.take(10)))
            f.write('\n\n')

        if args[2] == "clustering":
            # performClustering returns a pair whose first item names the
            # algorithm and whose second item is passed to the trainer.
            model = performClustering(sample, params)
            if model[0] == "gaussian":
                # Local import: the original referenced the misspelled,
                # undefined name "GuassianMixture".
                from pyspark.mllib.clustering import GaussianMixture
                theModel = GaussianMixture.train(dataset, model[1])
            else:
                theModel = KMeans.train(dataset, model[1])
            with open('results.txt', 'w+') as f:
                f.write(str(model))
            return theModel

        print("Currently this model selection algorithm only supports clustering for unsupervised algorithms")
        return None

    # Neither "supervised" nor "unsupervised": nothing to do.
    return None
def test_regression(self):
    """Check regressor prediction signs, initial-weight support and ``maxBins`` forwarding.

    Points with a negative second feature carry label -1.0, the others 1.0;
    every trained model must predict a non-positive value for the former and
    a strictly positive value for the latter.  The SGD trainers must accept
    explicit initial weights without raising ``ValueError``.  Finally,
    gradient-boosted trees are trained with ``maxBins=32`` and must raise
    with ``maxBins=1``.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(label, [0, second])
        for label, second in ((-1.0, -1), (1.0, 1), (-1.0, -2), (1.0, 2))
    ]
    rdd = self.sc.parallelize(data)
    features = [point.features.tolist() for point in data]

    def check_signs(model):
        # Even positions hold the negative examples, odd positions the positive ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    sgd_trainers = (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    for trainer in sgd_trainers:
        check_signs(trainer.train(rdd, iterations=10))

    categorical_info = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, maxBins=4))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numTrees=10, maxBins=4, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numIterations=4))

    # Explicit initial weights must be accepted by every SGD trainer.
    try:
        for trainer in sgd_trainers:
            trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()

    # Verify that maxBins is being passed through
    GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categorical_info, numIterations=4, maxBins=32)
    with self.assertRaises(Exception):
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categorical_info, numIterations=4, maxBins=1)
def evaluate_model_reg(test, model):
    """Return the mean squared error of ``model`` on ``test``.

    Args:
        test: RDD whose elements expose ``label`` and ``features``.
        model: Object with a ``predict(features)`` method.

    Returns:
        The sum of squared differences between labels and predictions,
        divided by the number of elements.
    """
    valuesAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
    # Index-based access replaces tuple-parameter unpacking, which Python 3 rejects.
    squared_error_sum = valuesAndPreds.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
    MSE = squared_error_sum / valuesAndPreds.count()
    return MSE


### LinearRegression with SGD
model_lreg_sgd_l2 = evaluate_model_reg(test_reg, LinearRegressionWithSGD.train(training_reg, iterations=1000, step=0.0001, regType="l2"))
model_lreg_sgd_l1 = evaluate_model_reg(test_reg, LinearRegressionWithSGD.train(training_reg, iterations=1000, step=0.0001, regType="l1"))
model_lreg_sgd_l0 = evaluate_model_reg(test_reg, LinearRegressionWithSGD.train(training_reg, iterations=1000, step=0.0001, regType=None))

### RidgeRegression
model_ridge = evaluate_model_reg(test_reg, RidgeRegressionWithSGD.train(training_reg, iterations=1000, step=0.0001))

### Lasso
model_lasso = evaluate_model_reg(test_reg, LassoWithSGD.train(training_reg, iterations=1000, step=0.0001))

#################### OUTPUTS #################################
print("Testing Error :"+"model_svm_l2 = " + str(model_svm_l2))
print("Testing Error :"+"model_svm_l1 = " + str(model_svm_l1))
print("Testing Error :"+"model_svm_l0 = " + str(model_svm_l0))
print("Testing Error :"+"model_log_lbfgs_l2 = " + str(model_log_lbfgs_l2))
print("Testing Error :"+"model_log_lbfgs_l1 = " + str(model_log_lbfgs_l1))
print("Testing Error :"+"model_log_lbfgs_l0 = " + str(model_log_lbfgs_l0))
print("Testing Error :"+"model_log_sgd_l2 = " + str(model_log_sgd_l2))
print("Testing Error :"+"model_log_sgd_l1 = " + str(model_log_sgd_l1))
print("Testing Error :"+"model_log_sgd_l0 = " + str(model_log_sgd_l0))
print("MSE Error :"+"model_lreg_sgd_l2 = " + str(model_lreg_sgd_l2))
# Ridge model: RMSE on the test set.  The element count is computed once and
# reused as the divisor, so the uncached RDD is not counted twice.
valuesAndPreds = testData.map(lambda p: (p.label, model_ridge.predict(p.features)))
test_cnt = valuesAndPreds.count()
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / test_cnt
ridge_RMSE_test = math.sqrt(MSE)

# Ridge model: RMSE on the training set.
valuesAndPreds = trainData.map(lambda p: (p.label, model_ridge.predict(p.features)))
train_cnt = valuesAndPreds.count()
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / train_cnt
ridge_RMSE_train = math.sqrt(MSE)

# Lasso Regression
model_lasso = LassoWithSGD.train(trainData, regParam=0.01, intercept=True)

# Lasso model: RMSE on the test set.
valuesAndPreds = testData.map(lambda p: (p.label, model_lasso.predict(p.features)))
test_cnt = valuesAndPreds.count()
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / test_cnt
lasso_RMSE_test = math.sqrt(MSE)

# Lasso model: RMSE on the training set.
valuesAndPreds = trainData.map(lambda p: (p.label, model_lasso.predict(p.features)))
train_cnt = valuesAndPreds.count()
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / train_cnt
lasso_RMSE_train = math.sqrt(MSE)
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LassoWithSGD, LassoModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")

# Load the evaluation sets and the training set (LibSVM format).
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_360.txt')

model = LassoWithSGD.train(traindata)

# For each dataset: predict, pair labels with predictions, print the MSE and
# save the pairs.  Index-based lambdas replace tuple-parameter unpacking,
# which Python 3 rejects.
predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_Lasso")

predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(predictions_720)
MSE_720 = labelsandpredictions_720.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/hf_720_Lasso")

predictions_540 = model.predict(data_540.map(lambda x: x.features))