Esempio n. 1
0
    def test_regression(self):
        """Train each regressor on four toy points and check the sign of its predictions.

        Points whose second feature is negative are labelled -1.0 and must be
        predicted as <= 0; points whose second feature is positive are labelled
        1.0 and must be predicted as > 0. Finally, every SGD trainer must accept
        explicit initial weights without raising ValueError.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        samples = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(samples)
        vectors = [sample.features.tolist() for sample in samples]

        def check_signs(model):
            # Even positions hold the negative examples, odd positions the positive ones.
            for position, vector in enumerate(vectors):
                prediction = model.predict(vector)
                if position % 2:
                    self.assertTrue(prediction > 0)
                else:
                    self.assertTrue(prediction <= 0)

        sgd_trainers = (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
        for sgd_trainer in sgd_trainers:
            check_signs(sgd_trainer.train(rdd, iterations=10))

        categorical_info = {0: 2}  # feature 0 has 2 categories
        check_signs(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categorical_info, maxBins=4))
        check_signs(RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categorical_info, numTrees=10, maxBins=4, seed=1))
        check_signs(GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categorical_info, numIterations=4))

        # A ValueError while training with initial weights is a test failure.
        try:
            for sgd_trainer in sgd_trainers:
                sgd_trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Esempio n. 2
0
def iterateLasso(iterNums, stepSizes, regParam, train, valid):
  """Fit Lasso for every (iterations, step) pair and print the resulting errors.

  For each combination a LassoWithSGD model with an intercept is trained on
  ``train``. One line is printed per combination: the iteration count, the
  step size, then the square root of the mean squared prediction error on
  ``train`` and on ``valid``. Nothing is returned.
  """
  from pyspark.mllib.regression import LassoWithSGD

  def rootMeanSquare(dataset, fitted):
    # (prediction, label) pairs -> sqrt(mean((prediction - label)^2))
    pairs = dataset.map(lambda x: (fitted.predict(x.features), x.label))
    return math.sqrt(pairs.map(lambda p: pow(p[0]-p[1],2)).mean())

  for numIter in iterNums:
    for step in stepSizes:
      fitted = LassoWithSGD().train(
        train, intercept=True, iterations=numIter, step=step, regParam=regParam)
      trainError = rootMeanSquare(train, fitted)
      validError = rootMeanSquare(valid, fitted)
      print("%d, %5.3f -> %.4f, %.4f" % (numIter, step, trainError, validError))
Esempio n. 3
0
    def test_regression(self):
        """Check that the three SGD regressors predict the right sign on toy data.

        Negative-labelled points must be predicted as <= 0 and positive-labelled
        points as > 0.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        labeled = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(labeled)
        neg_one, pos_one, neg_two, pos_two = [lp.features.tolist() for lp in labeled]

        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            model = trainer.train(rdd)
            self.assertTrue(model.predict(neg_one) <= 0)
            self.assertTrue(model.predict(pos_one) > 0)
            self.assertTrue(model.predict(neg_two) <= 0)
            self.assertTrue(model.predict(pos_two) > 0)
Esempio n. 4
0
    def test_regression(self):
        """Train linear, lasso and ridge SGD models and verify prediction signs.

        Each model is trained with default settings on four labelled points and
        must predict <= 0 for the -1.0 points and > 0 for the 1.0 points.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        points = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(points)
        vectors = [point.features.tolist() for point in points]
        # Pair every feature vector with whether a strictly positive prediction is expected.
        cases = list(zip(vectors, (False, True, False, True)))

        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            fitted = trainer.train(rdd)
            for vector, expect_positive in cases:
                score = fitted.predict(vector)
                if expect_positive:
                    self.assertTrue(score > 0)
                else:
                    self.assertTrue(score <= 0)
Esempio n. 5
0
    def test_regression(self):
        """Verify SGD regressor prediction signs on features built by ``self.scipy_matrix``.

        Every point stores a single value at index 1 of a size-2 vector; points
        with a negative value are labelled -1.0 and must be predicted as <= 0,
        points with a positive value are labelled 1.0 and must be predicted > 0.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        # (label, value stored at index 1) for each training point
        spec = [(-1.0, -1.0), (1.0, 1.0), (-1.0, -2.0), (1.0, 2.0)]
        data = [LabeledPoint(lbl, self.scipy_matrix(2, {1: val})) for lbl, val in spec]
        rdd = self.sc.parallelize(data)
        neg_a, pos_a, neg_b, pos_b = [point.features for point in data]

        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            model = trainer.train(rdd)
            self.assertTrue(model.predict(neg_a) <= 0)
            self.assertTrue(model.predict(pos_a) > 0)
            self.assertTrue(model.predict(neg_b) <= 0)
            self.assertTrue(model.predict(pos_b) > 0)
Esempio n. 6
0
    def test_regression(self):
        """Check linear, lasso and ridge SGD predictions on ``scipy_matrix`` features.

        Models are trained with default settings; -1.0-labelled points must be
        predicted as <= 0 and 1.0-labelled points as > 0.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        points = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(points)
        vectors = [point.features for point in points]

        def assert_signs(fitted):
            # Positions 0 and 2 are negative examples, 1 and 3 positive ones.
            for position, vector in enumerate(vectors):
                score = fitted.predict(vector)
                if position in (1, 3):
                    self.assertTrue(score > 0)
                else:
                    self.assertTrue(score <= 0)

        assert_signs(LinearRegressionWithSGD.train(rdd))
        assert_signs(LassoWithSGD.train(rdd))
        assert_signs(RidgeRegressionWithSGD.train(rdd))
Esempio n. 7
0
    def test_regression(self):
        """Train SGD and tree-based regressors on toy data and check prediction signs.

        Covers LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD,
        DecisionTree, RandomForest (100 trees, seed 1) and GradientBoostedTrees.
        Points labelled -1.0 must be predicted as <= 0, points labelled 1.0 as > 0.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        samples = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(samples)
        neg_one, pos_one, neg_two, pos_two = [s.features.tolist() for s in samples]

        def check_signs(model):
            self.assertTrue(model.predict(neg_one) <= 0)
            self.assertTrue(model.predict(pos_one) > 0)
            self.assertTrue(model.predict(neg_two) <= 0)
            self.assertTrue(model.predict(pos_two) > 0)

        for sgd_trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            check_signs(sgd_trainer.train(rdd))

        categorical_info = {0: 2}  # feature 0 has 2 categories
        check_signs(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categorical_info))
        check_signs(RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categorical_info, numTrees=100, seed=1))
        check_signs(GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categorical_info))
def iterateLasso(iterNums, stepSizes, regParam, train, valid):
    """Print train/validation error of Lasso for each iteration-count/step-size pair.

    A LassoWithSGD model with an intercept is trained on ``train`` for every
    combination. The printed line contains the iteration count, the step size
    and the square root of the mean squared prediction error on ``train`` and
    on ``valid``. Returns None.
    """
    from pyspark.mllib.regression import LassoWithSGD

    def squared_gap(pair):
        # pair is (prediction, label)
        return pow(pair[0] - pair[1], 2)

    for numIter in iterNums:
        for step in stepSizes:
            model = LassoWithSGD().train(
                train, intercept=True, iterations=numIter, step=step, regParam=regParam)

            def predicted_and_label(point):
                return (model.predict(point.features), point.label)

            train_error = math.sqrt(
                train.map(predicted_and_label).map(squared_gap).mean())
            valid_error = math.sqrt(
                valid.map(predicted_and_label).map(squared_gap).mean())
            print("%d, %5.3f -> %.4f, %.4f" %
                  (numIter, step, train_error, valid_error))
Esempio n. 9
0
def LassoModel(dataPath, label, normalize, character, master, ispca):
    """Fit a Lasso model on a delimited text file and report its mean absolute error.

    Parameters:
        dataPath: path handed to ``SparkContext.textFile``.
        label: when 0, every parsed row is reversed before use; the label is
            then read from position 0 of the (reversed) row.
        normalize: when 1, the rows are collected and passed through ``norm``.
        character: separator used to split each line into fields.
        master: master URL passed to ``SparkContext``.
        ispca: when 1, the features are reduced to 2 PCA components.

    Returns:
        A text report containing the model weights and the mean absolute
        difference between predictions and labels.

    The SparkContext created here is always stopped, even if a step fails.
    """
    pca_n = 2
    sc = SparkContext(master)
    try:
        data = sc.textFile(dataPath)

        # Build real lists (not lazy ``map`` objects) so rows can be reversed
        # and sliced on Python 3 as well as Python 2.
        ndata = data.map(lambda line: [float(x) for x in line.split(character)])

        if label == 0:
            ndata = ndata.map(lambda line: line[::-1])

        if normalize == 1:
            test_data = norm(ndata.collect())
            norm_data = sc.parallelize(test_data)
            train_data = norm_data.map(lambda part: lbp(part[0], part[1]))
        else:
            # NOTE(review): this slice stops one short of the end, so the last
            # column is not used as a feature - confirm that is intended.
            test_data = ndata.map(lambda part: (part[0], part[1:len(part) - 1])).collect()
            train_data = ndata.map(lambda part: lbp(part[0], part[1:len(part) - 1]))

        if ispca == 1:
            pca = PCA(n_components=pca_n)
            pca_train = [row[1] for row in test_data]
            pca_data = pca.fit(pca_train).transform(pca_train)

            # Re-attach each label to its reduced feature vector.
            test = [[row[0], reduced] for row, reduced in zip(test_data, pca_data)]

            train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
            test_data = test

        model_larg = larg.train(train_data)
        # count() yields the same number as len(collect()) without shipping
        # every row back to the driver.
        size = train_data.count()

        err_larg = 0.0
        for i in range(size):
            err_larg = err_larg + abs(model_larg.predict(test_data[i][1]) - test_data[i][0])

        mean_error = err_larg / size
        print("result: %s" % (mean_error,))

        report = "Lasso Regression Result:\n"
        report = report + str(model_larg.weights) + '\n'
        report = report + "Error: " + str(mean_error)
        return report
    finally:
        sc.stop()
Esempio n. 10
0
    def test_regression(self):
        """Check prediction signs of SGD and tree-based regressors on four toy points.

        Covers the three SGD regressors plus DecisionTree, RandomForest
        (100 trees) and GradientBoostedTrees. Negative-labelled points must be
        predicted as <= 0 and positive-labelled points as > 0.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        points = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(points)
        vectors = [point.features.tolist() for point in points]
        # Pair every feature vector with whether a strictly positive prediction is expected.
        cases = list(zip(vectors, (False, True, False, True)))

        def verify(fitted):
            for vector, expect_positive in cases:
                score = fitted.predict(vector)
                if expect_positive:
                    self.assertTrue(score > 0)
                else:
                    self.assertTrue(score <= 0)

        verify(LinearRegressionWithSGD.train(rdd))
        verify(LassoWithSGD.train(rdd))
        verify(RidgeRegressionWithSGD.train(rdd))

        tree_options = {"categoricalFeaturesInfo": {0: 2}}  # feature 0 has 2 categories
        verify(DecisionTree.trainRegressor(rdd, **tree_options))
        verify(RandomForest.trainRegressor(rdd, numTrees=100, **tree_options))
        verify(GradientBoostedTrees.trainRegressor(rdd, **tree_options))
Esempio n. 11
0
def linearRegression_f(mode):
    """Train a regression model on the global ``parsedData`` and return its training MSE.

    ``mode`` selects the trainer:
        "no_reg" -> LinearRegressionWithSGD
        "L1_reg" -> LassoWithSGD
        "L2_reg" -> RidgeRegressionWithSGD

    Returns:
        The mean of the squared differences between label and prediction over
        ``parsedData``.

    Raises:
        ValueError: if ``mode`` is not one of the three names above. Previously
            this case printed a message and then crashed on the unbound model.
    """
    if mode == "no_reg":
        model = LinearRegressionWithSGD.train(parsedData)
    elif mode == "L1_reg":
        model = LassoWithSGD.train(parsedData)
    elif mode == "L2_reg":
        model = RidgeRegressionWithSGD.train(parsedData)
    else:
        print("ERROR Mode")
        raise ValueError("unknown mode: %r" % (mode,))

    # Evaluate the model on training data:
    # map parsedData to (label, prediction) pairs.
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

    # Sum the squared errors and divide by the number of pairs to get the MSE.
    # The pair is indexed instead of unpacked in the lambda signature because
    # tuple parameters are a syntax error on Python 3.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()

    return MSE
Esempio n. 12
0
    def test_regression(self):
        """Check SGD regressors and a decision tree on ``scipy_matrix`` features.

        Each point stores one value at index 1 of a size-2 vector. Points
        labelled -1.0 must be predicted as <= 0, points labelled 1.0 as > 0.
        """
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD,
            LassoWithSGD,
            RidgeRegressionWithSGD,
        )
        from pyspark.mllib.tree import DecisionTree

        # (label, value stored at index 1) for each training point
        spec = [(-1.0, -1.0), (1.0, 1.0), (-1.0, -2.0), (1.0, 2.0)]
        data = [LabeledPoint(lbl, self.scipy_matrix(2, {1: val})) for lbl, val in spec]
        rdd = self.sc.parallelize(data)
        neg_a, pos_a, neg_b, pos_b = [point.features for point in data]

        def check_signs(model):
            self.assertTrue(model.predict(neg_a) <= 0)
            self.assertTrue(model.predict(pos_a) > 0)
            self.assertTrue(model.predict(neg_b) <= 0)
            self.assertTrue(model.predict(pos_b) > 0)

        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            check_signs(trainer.train(rdd))

        categorical_info = {0: 2}  # feature 0 has 2 categories
        check_signs(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categorical_info))
Esempio n. 13
0
	def exercise_2(self):
		"""
		Fit linear, ridge and lasso SGD regressions on the carat CSV and print their test MSE.

		Reads "./carat-context-factors-percom.csv" (";"-separated), drops rows
		whose readings fall outside the ranges checked below, and uses CPU
		usage, screen brightness, Wi-Fi signal strength and battery temperature
		as features for the energy rate. The data is split randomly with
		weights 4:1; the three models are trained on the larger part and
		printed, followed by a tuple with their mean squared errors on the
		smaller part. Always returns None.
		"""
		sc = self.spark_context
		lines = sc.textFile("./carat-context-factors-percom.csv")
		# Column positions, in file order.
		energyRate,batteryHealth,batteryTemperature,batteryVoltage,\
			cpuUsage,distanceTraveled,mobileDataActivity,mobileDataStatus,\
			mobileNetworkType,networkType,roamingEnabled,screenBrightness,\
			wifiLinkSpeed,wifiSignalStrength = range(14)
		data = lines.map(lambda line: line.split(";")).map(lambda line:
			(float(line[energyRate]),line[batteryHealth],
			float(line[batteryTemperature]),float(line[batteryVoltage]),
			float(line[cpuUsage]),float(line[distanceTraveled]),
			line[mobileDataActivity],line[mobileDataStatus],
			line[mobileNetworkType],line[networkType],
			float(line[roamingEnabled]),float(line[screenBrightness]),
			float(line[wifiLinkSpeed]),float(line[wifiSignalStrength])))

		def in_valid_range(x):
			# Screen brightness is either the value -1 or lies within 0..255.
			return ((x[screenBrightness] == -1 or 0 <= x[screenBrightness] <= 255) and
					0 <= x[cpuUsage] <= 1 and
					x[distanceTraveled] >= 0 and
					-100 < x[wifiSignalStrength] < 0 and
					x[batteryTemperature] >= 0)

		data = data.filter(in_valid_range)
		data = data.map(lambda x:LabeledPoint(x[energyRate],
					[x[cpuUsage],x[screenBrightness], x[wifiSignalStrength], x[batteryTemperature]]))
		train,test = data.randomSplit([4,1])

		# print(...) with one argument behaves the same on Python 2 and 3.
		# The trailing comments record the output noted by the original author.
		lr = LinearRegressionWithSGD.train(train,iterations=100,step=1e-4,intercept=False)
		print(lr)#(weights=[4.05918718288e-07,2.01710179227e-05,-3.39410603521e-05,1.70383825251e-05], intercept=0.0)
		rr = RidgeRegressionWithSGD.train(train,iterations=100,step=1e-4,intercept=False)
		print(rr)#(weights=[4.05918453228e-07,2.0170994023e-05,-3.39410381473e-05,1.70383716836e-05], intercept=0.0)
		lasso = LassoWithSGD.train(train,iterations=100,step=1e-4,intercept=False)
		print(lasso)#(weights=[0.0,1.96629057526e-05,-3.29054093642e-05,1.56445907401e-05], intercept=0.0)
		# Each element: (label, linear prediction, ridge prediction, lasso prediction).
		valuesAndPreds = test.map(lambda p: (p.label,lr.predict(p.features),
								rr.predict(p.features),lasso.predict(p.features)))
		count = valuesAndPreds.count()
		# Index the tuple instead of unpacking it in the lambda signature:
		# tuple parameters are a syntax error on Python 3.
		MSE = valuesAndPreds.map(lambda t: ((t[0] - t[1])**2/count,
									(t[0] - t[2])**2/count,(t[0] - t[3])**2/count))\
							.reduce(lambda a,b:(a[0]+b[0],a[1]+b[1],a[2]+b[2]))
		print(MSE) #(4.7634385303075644e-05, 4.7634387065855108e-05, 4.7873793406702168e-05)
		return None
Esempio n. 14
0
    def test_regression(self):
        """Verify prediction signs of SGD regressors and a decision tree regressor.

        Features come from ``self.scipy_matrix``. Models must predict <= 0 for
        the -1.0-labelled points and > 0 for the 1.0-labelled points.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree

        points = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(points)
        vectors = [point.features for point in points]

        def verify(fitted):
            # Positions 0 and 2 are negative examples, 1 and 3 positive ones.
            for position, vector in enumerate(vectors):
                score = fitted.predict(vector)
                if position % 2:
                    self.assertTrue(score > 0)
                else:
                    self.assertTrue(score <= 0)

        verify(LinearRegressionWithSGD.train(rdd))
        verify(LassoWithSGD.train(rdd))
        verify(RidgeRegressionWithSGD.train(rdd))

        categorical_info = {0: 2}  # feature 0 has 2 categories
        verify(DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categorical_info))
Esempio n. 15
0
def performLasso(training):
	"""Train LassoWithSGD on ``training`` (100 iterations, step 0.001) and return the model."""
	return LassoWithSGD.train(training, iterations=100, step=0.001)
        # Transform the Data
        TestRDD = TestRDD.map(lambda x: (mappingDates(x[0], authorDate), x[1]))
        TrainingRDD = TrainingRDD.map(
            lambda x: (
                mappingDates(
                    x[0],
                    authorDate),
                x[1]))
        # Create Hashed Vectors
        TestRDD = TestRDD.map(lambda x: (hashVector(x[0], x[1], 10000)))
        TrainingRDD = TrainingRDD.map(
            lambda x: (hashVector(x[0], x[1], 10000)))
        # Create Labelled Points of Each of the Vectors
        TrainingRDD = TrainingRDD.map(lambda f_x: LabeledPoint(f_x[0], f_x[1]))
        # Train Model on The Training Set
        model = LassoWithSGD.train(TrainingRDD)
        # Test the Model on the Test Set
        predictions = []
        TestRDD_Array = TestRDD.values().collect()
        for i in np.arange(0, len(TestRDD_Array)):
            Prediction_Label = model.predict(np.array(TestRDD_Array[i]))
            predictions.append(Prediction_Label)

        TestRDD_Array_Label = TestRDD.keys().collect()
        for i in np.arange(0, len(TestRDD_Array_Label)):
            print TestRDD_Array_Label[i], predictions[i]

    # Stop Watch
    modelTime = time() - modelTime
    print('\n############ Processing Completed ##############')
    print('################################################\n')
from pyspark.mllib.regression import LabeledPoint, LassoWithSGD
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
# Load and parse the data
def parsePoint(line):
    """Convert one ';'-separated line of numbers into a LabeledPoint.

    The value at index 11 becomes the label and the values at indexes 0-9
    become the feature vector.
    """
    numbers = list(map(float, line.split(';')))
    # NOTE(review): index 10 ends up in neither the label nor the features -
    # confirm that skipping it is intended.
    return LabeledPoint(numbers[11], numbers[:10])


sc = SparkContext("local", "Simple App")
data = sc.textFile("../winequality.csv")
parsedData = data.map(parsePoint)

# Build the model
model = LassoWithSGD.train(parsedData)

# Evaluating the model on training data: fraction of points whose prediction
# is not exactly equal to the label.
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
# The (label, prediction) pair is indexed rather than unpacked in the lambda
# signature, because tuple parameters are a syntax error on Python 3.
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
Esempio n. 18
0

#load and parse the data
def parsePoint(line):
    """Convert one comma- or space-separated line of numbers into a LabeledPoint.

    Commas are replaced by single spaces before splitting on ' '. The value at
    index 6 becomes the label and the values at indexes 0-5 the features.
    """
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in NumPy >= 1.24; the builtin gives identical results.
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[6], values[0:6])


data = sc.textFile("/user/cloudera/hw1/train_nohead.csv")
wholedata = sc.textFile("/user/cloudera/hw1/wholedata.csv")

parsedData = data.map(parsePoint)
parsedWholeData = wholedata.map(parsePoint)

# Build the model on the training file.
model = LassoWithSGD.train(parsedData, iterations=100, step=0.1, regParam=0.01)

# Evaluate the model on the whole data set as (label, prediction) pairs.
valuesAndPreds = parsedWholeData.map(lambda p:
                                     (p.label, model.predict(p.features)))

# Root of the mean squared error. The pair is indexed rather than unpacked in
# the lambda signature, because tuple parameters are a syntax error on Python 3.
RMSE = np.sqrt(
    valuesAndPreds
    .map(lambda vp: (vp[0] - vp[1]) ** 2)
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
)
print("lasso output : \n")
print("RMSE = {0}\n".format(RMSE))

# Save the trained model.
model.save(sc, "/user/cloudera/hw1/results/2015310884_lasso")
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LassoWithSGD, LassoModel
from pyspark.mllib.util import MLUtils

def _squared_error(pair):
    """Return (label - prediction) squared for one (label, prediction) pair."""
    # Indexing replaces ``lambda (v,p): ...``, which is a syntax error on Python 3.
    return (pair[0] - pair[1]) * (pair[0] - pair[1])


sc = SparkContext(appName="PythonWordCount")
data=MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata=MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720=MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540=MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360=MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_360.txt')

# One model is trained on ``traindata`` and then applied to every other data set.
model = LassoWithSGD.train(traindata)

# For each data set: predict, pair labels with predictions, print the mean
# squared error and save the pairs as text.
predictions = model.predict(data.map(lambda x:x.features))
labelsandpredictions=data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(_squared_error).sum()/float(data.count())
print("training MSE = "+str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_Lasso")
predictions_720 = model.predict(data_720.map(lambda x:x.features))
labelsandpredictions_720=data_720.map(lambda lp: lp.label).zip(predictions_720)
MSE_720 = labelsandpredictions_720.map(_squared_error).sum()/float(data_720.count())
print("training MSE_720 = "+str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/hf_720_Lasso")
predictions_540 = model.predict(data_540.map(lambda x:x.features))
labelsandpredictions_540=data_540.map(lambda lp: lp.label).zip(predictions_540)
MSE_540 = labelsandpredictions_540.map(_squared_error).sum()/float(data_540.count())
print("training MSE_540 = "+str(MSE_540))
labelsandpredictions_540.saveAsTextFile("/usr/hadoop/hf_540_Lasso")
predictions_360 = model.predict(data_360.map(lambda x:x.features))
Esempio n. 20
0
# Look at the first five labeled points before splitting.
autoDataLabelPoint.take(5)

# Step 9-7-3. Dividing training and testing data.
# randomSplit is called with weights 0.7 / 0.3 and without a seed argument;
# element 0 is used for training and element 1 for testing.

autoDataLabelPointSplit = autoDataLabelPoint.randomSplit([0.7,0.3])
autoDataLabelPointTrain = autoDataLabelPointSplit[0]
autoDataLabelPointTest = autoDataLabelPointSplit[1]
autoDataLabelPointTrain.take(5)
autoDataLabelPointTest.take(5)
autoDataLabelPointTest.count()
autoDataLabelPointTrain.count()

# Step 9-8-1. Creating a linear regression model with Lasso.
# Trained on the training split with 400 iterations, step 0.0005,
# regularization parameter 0.05 and an intercept.

from pyspark.mllib.regression import LassoWithSGD  as lassoSGD
ourModelWithLasso  = lassoSGD.train(data = autoDataLabelPointTrain, iterations = 400, step = 0.0005,regParam = 0.05, intercept = True)

# Inspect the fitted intercept.
ourModelWithLasso.intercept

# Inspect the fitted weights.
ourModelWithLasso.weights

# Step 9-8-2. Predicting the data using the lasso model.
# Each element of the result is a (label, prediction) pair of floats.

actualDataandLassoPredictedData = autoDataLabelPointTest.map(lambda data : (float(data.label) , float(ourModelWithLasso.predict(data.features))))
actualDataandLassoPredictedData.take(5)

# Step 9-8-3. Evaluating the model we have created.
# NOTE(review): RegressionMetrics is documented as taking (prediction,
# observation) pairs while the pairs built above are (label, prediction).
# This should not matter for the root mean squared error read below, but
# verify the order before reading any other metric from this object.

from pyspark.mllib.evaluation import RegressionMetrics as rmtrcs
ourLassoModelMetrics = rmtrcs(actualDataandLassoPredictedData)
ourLassoModelMetrics.rootMeanSquaredError
Esempio n. 21
0
def modelSelection(argv):
    """Pick, train and evaluate an MLlib model from command-line style arguments.

    Args:
        argv: Argument list laid out like ``sys.argv``. ``argv[0]`` is ignored,
            ``argv[1]`` is the dataset path (must end in ``csv`` or ``json``),
            ``argv[2]`` is ``"supervised"`` or ``"unsupervised"``,
            ``argv[3]`` is ``"classification"``, ``"regression"`` or
            ``"clustering"``, and ``argv[4:]`` are forwarded to the parsing
            and model-selection helpers.

    Returns:
        The trained model, or ``None`` when the arguments are incomplete or
        unsupported (an explanatory message is printed in that case).

    Side effects:
        Writes ``results.txt`` (and, for unsupervised runs, ``datapoints.txt``)
        in the current working directory.
    """
    if len(argv) < 5:
        # The third line of the usage text names the values the code actually
        # accepts ("classification", not "classifier").
        print("The arguments for this script require:\n" +
              "(hdfs or file):///path/to/filename of the dataset\n" +
              "supervised/unsupervised\n" +
              "classification/regression/clustering\n" +
              "parameter trying to be guessed\n" +
              "other parameters\n")
        return None

    args = argv[1:]
    path, learning_type, task = args[0], args[1], args[2]
    params = args[3:]

    # Decide on the parser before touching Spark so an unsupported file type
    # is rejected without creating an RDD.
    if path.endswith("csv"):
        parse = csvFilterAndMap
    elif path.endswith("json"):
        parse = jsonFilterAndMap
    else:
        print("This program only supports .csv and .json files")
        return None
    dataset = parse(sc.textFile(path), params)

    # Model selection algorithm. Currently goes off of scikit-learn's cheat sheet.
    if learning_type == "supervised":
        return _trainSupervised(dataset, task, params)
    if learning_type == "unsupervised":
        return _trainUnsupervised(dataset, task, params)

    print("Please use either supervised or unsupervised as the learning type")
    return None


def _trainSupervised(dataset, task, params):
    """Train and score a classifier or regressor; return it (None if task is unknown)."""
    # NOTE(review): rows are assumed to be sequences shaped
    # (label, feature_1, ..., feature_n), as implied by the x[0] / x[1:]
    # indexing this code has always used - confirm against the parsers.
    # Build the LabeledPoint straight from the row: zipping labels with values
    # first and then slicing the resulting pair with [1:] wrapped the feature
    # list inside a 1-tuple.
    labeled = dataset.map(lambda row: LabeledPoint(row[0], row[1:])).cache()
    training, test = labeled.randomSplit([.8, .2])

    if task == "classification":
        model = NaiveBayes.train(training)
        label_and_pred = test.map(lambda p: p.label).zip(
            model.predict(test.map(lambda p: p.features)))
        # MulticlassMetrics expects (prediction, label) pairs; feeding it
        # (label, prediction) transposes the confusion matrix.
        metrics = MulticlassMetrics(
            label_and_pred.map(lambda lp: (float(lp[1]), lp[0])))
        # NOTE(review): precision() without a label is the overall precision
        # on older Spark releases; newer ones expose `accuracy` instead.
        accuracy = metrics.precision()

        with open('results.txt', 'w') as f:
            f.write("accuracy: " + str(accuracy) + "\n")
            f.write("confusion matrix:\n" + str(metrics.confusionMatrix().toArray()))
        return model

    if task == "regression":
        # A 30% sample is handed to performRegression, which names the
        # regression family to train on the full training split.
        sample = labeled.sample(False, .3)
        model_name = performRegression(sample, params)
        if model_name == "lasso":
            model = LassoWithSGD.train(training, iterations=1000, step=0.001)
        elif model_name == "linear":
            model = LinearRegressionWithSGD.train(training, iterations=1000, step=0.001)
        else:
            model = RidgeRegressionWithSGD.train(training, iterations=1000, step=0.001)

        label_and_pred = test.map(lambda p: p.label).zip(
            model.predict(test.map(lambda p: p.features)))
        # RegressionMetrics expects (prediction, observation) pairs.
        metrics = RegressionMetrics(
            label_and_pred.map(lambda lp: (float(lp[1]), lp[0])))
        with open('results.txt', 'w') as f:
            f.write(model_name + " root mean squared error: ")
            f.write(str(metrics.rootMeanSquaredError))
        return model

    print("Please use rather classification or regression for supervised learning")
    return None


def _trainUnsupervised(dataset, task, params):
    """Train a clustering model; return it (None if task is not clustering)."""
    sample = dataset.sample(False, .3)
    # Dump the first few rows for manual inspection.
    with open('datapoints.txt', 'w') as f:
        f.write("dataset:\t" + str(dataset.take(10)))
        f.write('\n\n')

    if task != "clustering":
        print("Currently this model selection algorithm only supports clustering for unsupervised algorithms")
        return None

    # performClustering returns a sequence whose first item is the algorithm
    # name and whose second item is passed to train() as the cluster count.
    choice = performClustering(sample, params)
    if choice[0] == "gaussian":
        # The previous spelling `GuassianMixture` is not a pyspark class.
        model = GaussianMixture.train(dataset, choice[1])
    else:
        model = KMeans.train(dataset, choice[1])
    with open('results.txt', 'w') as f:
        f.write(str(choice))
    return model
Esempio n. 22
0
    def test_regression(self):
        """Smoke-test the MLlib regressors on a tiny data set.

        Every point's label has the same sign as its second feature, so each
        trained model is expected to predict a non-positive value for the
        negative points and a strictly positive value for the positive ones.
        The test also checks that the SGD trainers accept ``initialWeights``
        and that ``maxBins`` reaches the gradient-boosted trees trainer.
        """
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
        from pyspark.mllib.tree import (
            DecisionTree, RandomForest, GradientBoostedTrees)

        points = [
            LabeledPoint(label, [0, second])
            for label, second in ((-1.0, -1), (1.0, 1), (-1.0, -2), (1.0, 2))
        ]
        rdd = self.sc.parallelize(points)
        rows = [pt.features.tolist() for pt in points]

        def check_signs(model):
            # Rows 0 and 2 carry negative labels, rows 1 and 3 positive ones.
            self.assertTrue(model.predict(rows[0]) <= 0)
            self.assertTrue(model.predict(rows[1]) > 0)
            self.assertTrue(model.predict(rows[2]) <= 0)
            self.assertTrue(model.predict(rows[3]) > 0)

        sgd_trainers = (
            LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
        for trainer in sgd_trainers:
            check_signs(trainer.train(rdd, iterations=10))

        categories = {0: 2}  # feature 0 has 2 categories
        check_signs(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categories, maxBins=4))
        check_signs(RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categories,
            numTrees=10, maxBins=4, seed=1))
        check_signs(GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categories, numIterations=4))

        # Supplying explicit initial weights must not raise ValueError.
        try:
            for trainer in sgd_trainers:
                trainer.train(
                    rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()

        # Verify that maxBins is being passed through: 32 bins trains fine,
        # a single bin is expected to be rejected.
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categories,
            numIterations=4, maxBins=32)
        with self.assertRaises(Exception):
            GradientBoostedTrees.trainRegressor(
                rdd, categoricalFeaturesInfo=categories,
                numIterations=4, maxBins=1)
def evaluate_model_reg(test, model):
    """Return the mean squared error of *model* on the labeled points in *test*.

    Args:
        test: RDD (or any object offering ``map``, ``reduce`` and ``count``)
            of points exposing ``label`` and ``features`` attributes.
        model: Object exposing ``predict(features)`` that returns a number.

    Returns:
        The sum of squared residuals ``(label - prediction) ** 2`` divided by
        the number of points.
    """
    # Compute each squared residual directly from the point. This avoids the
    # `lambda (v, p): ...` tuple-parameter form, which is a SyntaxError on
    # Python 3 (PEP 3113), and drops the intermediate pair collection.
    squared_errors = test.map(
        lambda p: (p.label - model.predict(p.features)) ** 2)
    total = squared_errors.reduce(lambda x, y: x + y)
    # float() keeps the division a true division on Python 2 as well.
    return total / float(squared_errors.count())


# Linear regression trained with SGD, scored on the test set once per
# regularisation type (L2, L1, none).
model_lreg_sgd_l2 = evaluate_model_reg(
    test_reg,
    LinearRegressionWithSGD.train(
        training_reg, iterations=1000, step=0.0001, regType="l2"))
model_lreg_sgd_l1 = evaluate_model_reg(
    test_reg,
    LinearRegressionWithSGD.train(
        training_reg, iterations=1000, step=0.0001, regType="l1"))
model_lreg_sgd_l0 = evaluate_model_reg(
    test_reg,
    LinearRegressionWithSGD.train(
        training_reg, iterations=1000, step=0.0001, regType=None))

# Ridge regression, same iteration count and step size.
model_ridge = evaluate_model_reg(
    test_reg,
    RidgeRegressionWithSGD.train(training_reg, iterations=1000, step=0.0001))

# Lasso regression, same iteration count and step size.
model_lasso = evaluate_model_reg(
    test_reg,
    LassoWithSGD.train(training_reg, iterations=1000, step=0.0001))

# ---------------------------- report ----------------------------
# Scores of the models evaluated earlier in the script, then the first
# regression MSE.
print("Testing Error :model_svm_l2 = %s" % (model_svm_l2,))
print("Testing Error :model_svm_l1 = %s" % (model_svm_l1,))
print("Testing Error :model_svm_l0 = %s" % (model_svm_l0,))

print("Testing Error :model_log_lbfgs_l2 = %s" % (model_log_lbfgs_l2,))
print("Testing Error :model_log_lbfgs_l1 = %s" % (model_log_lbfgs_l1,))
print("Testing Error :model_log_lbfgs_l0 = %s" % (model_log_lbfgs_l0,))

print("Testing Error :model_log_sgd_l2 = %s" % (model_log_sgd_l2,))
print("Testing Error :model_log_sgd_l1 = %s" % (model_log_sgd_l1,))
print("Testing Error :model_log_sgd_l0 = %s" % (model_log_sgd_l0,))

print("MSE Error :model_lreg_sgd_l2 = %s" % (model_lreg_sgd_l2,))
Esempio n. 24
0
# Ridge regression, test set: pair each label with the ridge prediction,
# average the squared differences, and take the square root to get the RMSE.
valuesAndPreds = testData.map(lambda p:
                              (p.label, model_ridge.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
# Number of test points scored.
test_cnt = valuesAndPreds.count()
ridge_RMSE_test = math.sqrt(MSE)

# Ridge regression, training set: same computation on trainData.
# valuesAndPreds and MSE are reused, so the test-set values are overwritten.
valuesAndPreds = trainData.map(lambda p:
                               (p.label, model_ridge.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
# Number of training points scored.
train_cnt = valuesAndPreds.count()
ridge_RMSE_train = math.sqrt(MSE)

# Lasso regression: train with an intercept and regParam=0.01; every other
# training option is left at whatever LassoWithSGD.train defaults to.
model_lasso = LassoWithSGD.train(trainData, regParam=0.01, intercept=True)

# Lasso, test set RMSE (same procedure as for ridge above).
valuesAndPreds = testData.map(lambda p:
                              (p.label, model_lasso.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
test_cnt = valuesAndPreds.count()
lasso_RMSE_test = math.sqrt(MSE)

# Lasso, training set RMSE.
valuesAndPreds = trainData.map(lambda p:
                               (p.label, model_lasso.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
train_cnt = valuesAndPreds.count()
lasso_RMSE_train = math.sqrt(MSE)
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LassoWithSGD, LassoModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")

# Load the data sets to score and the training set; all are read with the
# LIBSVM loader.
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc,
                                  '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc,
                                  '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc,
                                  '/usr/hadoop/para_avg_halfsqrsum_360.txt')

# Fit a Lasso model on the training set with the trainer's default options.
model = LassoWithSGD.train(traindata)

# Score the full data set: pair each true label with its prediction.
predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
# Index into the (label, prediction) pair rather than unpacking it in the
# lambda signature: tuple parameters are a SyntaxError on Python 3 (PEP 3113),
# while indexing works on both Python 2 and 3.
# NOTE(review): the message says "training MSE" but the error is computed on
# `data`, not `traindata` - confirm which was intended.
MSE = labelsandpredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_Lasso")

# Same procedure for the 720 data set.
predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
MSE_720 = labelsandpredictions_720.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(
        data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/hf_720_Lasso")

# Predictions for the 540 data set.
predictions_540 = model.predict(data_540.map(lambda x: x.features))