Esempio n. 1
0
    def test_regression(self):
        """Smoke-test the MLlib regression learners on a tiny sign-prediction set."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        # Labels share the sign of feature 1; feature 0 carries no signal.
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_sign(model):
            # Even-indexed points are negative, odd-indexed points positive.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        check_sign(LinearRegressionWithSGD.train(rdd, iterations=10))
        check_sign(LassoWithSGD.train(rdd, iterations=10))
        check_sign(RidgeRegressionWithSGD.train(rdd, iterations=10))

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        check_sign(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
        check_sign(RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1))
        check_sign(GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4))

        # Supplying explicit initial weights must not raise ValueError.
        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Esempio n. 2
0
def iterateRidge(iterNums, stepSizes, regParam, train, valid):
  """Grid-search ridge regression over iteration counts and step sizes.

  For every (iterations, step) pair, fits a RidgeRegressionWithSGD model on
  `train` and prints "<iterations>, <step> -> <train RMSE>, <valid RMSE>".

  iterNums  -- iterable of SGD iteration counts to try
  stepSizes -- iterable of SGD step sizes to try
  regParam  -- L2 regularization strength
  train     -- RDD of LabeledPoint used for fitting
  valid     -- RDD of LabeledPoint used for validation
  """
  from pyspark.mllib.regression import RidgeRegressionWithSGD
  import math

  def rmse(rdd, model):
    # Root-mean-square error of `model` over an RDD of LabeledPoint.
    # (Previously mis-named `meanSquared` although it held the RMSE.)
    pairs = rdd.map(lambda x: (model.predict(x.features), x.label))
    return math.sqrt(pairs.map(lambda p: pow(p[0] - p[1], 2)).mean())

  for numIter in iterNums:
    for step in stepSizes:
      # train() is a classmethod; no throwaway instance needed.
      model = RidgeRegressionWithSGD.train(
          train, intercept=True, regParam=regParam,
          iterations=numIter, step=step)
      print("%d, %5.3f -> %.4f, %.4f" %
            (numIter, step, rmse(train, model), rmse(valid, model)))
Esempio n. 3
0
    def test_regression(self):
        """Linear, Lasso and Ridge SGD models should recover the label-sign pattern."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD

        # Label sign follows feature 1; feature 0 is constant.
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        # Train the three SGD learners in turn and apply identical sign checks.
        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            model = trainer.train(rdd)
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)
Esempio n. 4
0
    def test_regression(self):
        """SGD regression learners should accept SciPy sparse feature vectors."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD

        # Sparse points: only feature 1 is set, and it determines the label sign.
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        # Same sign assertions for each of the three SGD learners.
        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            model = trainer.train(rdd)
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)
def iterateRidge(iterNums, stepSizes, regParam, train, valid):
    """Print train/validation RMSE for every (iterations, step) combination."""
    from pyspark.mllib.regression import RidgeRegressionWithSGD
    import math
    for numIter in iterNums:
        for step in stepSizes:
            ridge = RidgeRegressionWithSGD()
            fitted = ridge.train(train, intercept=True, regParam=regParam,
                                 iterations=numIter, step=step)
            # (prediction, label) pairs for both splits.
            trainPairs = train.map(
                lambda x: (fitted.predict(x.features), x.label))
            validPairs = valid.map(
                lambda x: (fitted.predict(x.features), x.label))
            trainErr = math.sqrt(
                trainPairs.map(lambda p: pow(p[0] - p[1], 2)).mean())
            validErr = math.sqrt(
                validPairs.map(lambda p: pow(p[0] - p[1], 2)).mean())
            print("%d, %5.3f -> %.4f, %.4f" %
                  (numIter, step, trainErr, validErr))
Esempio n. 6
0
def RidgeRegressionModel(dataPath, label, normalize, character, master, ispca):
    """Train a ridge-regression model on a delimited text file and report its error.

    dataPath  -- path to the input text file
    label     -- 0 if the label is the last column (each row gets reversed so
                 the label comes first), otherwise the label is assumed first
    normalize -- 1 to normalize the collected data with the external `norm` helper
    character -- field delimiter used to split each line
    master    -- Spark master URL for the SparkContext
    ispca     -- 1 to reduce the features to 2 principal components first

    Returns a human-readable result string with the model weights and the
    mean absolute error over the (training) data.

    NOTE(review): Python 2 code (`print` statement; `map()` assumed to return
    a list).  Relies on module-level helpers `norm`, `lbp`, `PCA` and the
    alias `rr` (presumably RidgeRegressionWithSGD) -- confirm against the
    surrounding module.
    """
    pca_n = 2  # number of principal components kept when ispca == 1
    sc = SparkContext(master)
    data = sc.textFile(dataPath)

# not RDD data 

    # Split each line on the delimiter and convert every field to float.
    ndata = data.map(lambda line: line.split(character)).map(lambda part: (map(lambda x: float(x) ,part[0: len(part)])))

    if label == 0:
        # Label was in the last column; reverse so it comes first.
        ndata = ndata.map(lambda line: line[::-1])

    if normalize == 1:
        # Normalize on the driver, then re-parallelize and build LabeledPoints.
        test_data = norm(ndata.collect())    
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))   
     #raw_data = data.map(lambda line: line.split(character))

    else:
        # NOTE(review): part[1:len(part) - 1] drops the final column from the
        # features -- confirm whether that is intentional.
        test_data = ndata.map(lambda part: (part[0], part[1:len(part) - 1])).collect()
        train_data = ndata.map(lambda part: lbp(part[0], part[1: len(part) - 1]))

    if ispca == 1:
        # Project the features onto pca_n components (fit on the driver).
        pca = PCA(n_components = pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)

        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])

        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
    

    # `rr` is expected to be RidgeRegressionWithSGD (defined elsewhere).
    model_rr = rr.train(train_data)
    err_rr = 0.0
    size = len(train_data.collect())

 
    # Mean absolute error over the same data the model was trained on.
    for i in range(size):
        err_rr = err_rr + abs(model_rr.predict(test_data[i][1]) - test_data[i][0]) 
    
    print "result:", err_rr/size

    String = "Ridge Regression Result:\n"
    String = String + str(model_rr.weights) + '\n'
    String = String + "Error: " + str(err_rr / size)

    sc.stop() 

    return String
Esempio n. 7
0
    def test_regression(self):
        """Every regressor should predict the sign of feature 1 on a toy set."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        # Labels share the sign of feature 1; feature 0 carries no signal.
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_sign(model):
            # Prediction sign must match each point's label sign.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        check_sign(LinearRegressionWithSGD.train(rdd))
        check_sign(LassoWithSGD.train(rdd))
        check_sign(RidgeRegressionWithSGD.train(rdd))

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        check_sign(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
        check_sign(RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
        check_sign(GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
Esempio n. 8
0
    def test_regression(self):
        """Every regressor should predict the sign of feature 1 on a toy set."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        # Labels share the sign of feature 1; feature 0 carries no signal.
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_sign(model):
            # Prediction sign must match each point's label sign.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        check_sign(LinearRegressionWithSGD.train(rdd))
        check_sign(LassoWithSGD.train(rdd))
        check_sign(RidgeRegressionWithSGD.train(rdd))

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        check_sign(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
        check_sign(RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
        check_sign(GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
Esempio n. 9
0
def linearRegression_f(mode):
    """Train a linear model on the module-level `parsedData` RDD and return its MSE.

    mode -- "no_reg" (plain SGD), "L1_reg" (Lasso) or "L2_reg" (Ridge).

    Raises ValueError for any other mode.  (Previously an invalid mode only
    printed an error and then crashed with NameError because `model` was
    never bound.)
    """
    if mode == "no_reg":
        model = LinearRegressionWithSGD.train(parsedData)
    elif mode == "L1_reg":
        model = LassoWithSGD.train(parsedData)
    elif mode == "L2_reg":
        model = RidgeRegressionWithSGD.train(parsedData)
    else:
        # Fail fast instead of falling through with `model` undefined.
        print("ERROR Mode")
        raise ValueError("unknown mode: %r" % (mode,))

    # Pair each true label with the model's prediction.
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))

    # Mean squared error.  Indexing replaces the Python-2-only
    # `lambda (v, p):` tuple-parameter syntax.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / valuesAndPreds.count()

    return MSE
Esempio n. 10
0
    def test_regression(self):
        """SGD regressors and DecisionTree should handle SciPy sparse vectors."""
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD,
            LassoWithSGD,
            RidgeRegressionWithSGD,
        )
        from pyspark.mllib.tree import DecisionTree

        # Sparse points: only feature 1 is set, and it determines the label sign.
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0})),
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        def check_sign(model):
            # Sign of the prediction must track the sign of feature 1.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        check_sign(LinearRegressionWithSGD.train(rdd))
        check_sign(LassoWithSGD.train(rdd))
        check_sign(RidgeRegressionWithSGD.train(rdd))

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        check_sign(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
Esempio n. 11
0
	def exercise_2(self):
		"""Fit linear, ridge and lasso SGD models on the Carat energy dataset.

		Reads carat-context-factors-percom.csv, drops rows with out-of-range
		sensor values, trains three regressors on [cpuUsage, screenBrightness,
		wifiSignalStrength, batteryTemperature] to predict energyRate, and
		prints each model plus the per-model test MSEs.

		NOTE(review): Python 2 only -- uses `print` statements and the
		tuple-parameter lambda `lambda (v,lrp,rrp,lp):`.
		"""
		sc = self.spark_context
		file = sc.textFile("./carat-context-factors-percom.csv")
		# Column indices 0..13 of the semicolon-separated CSV.
		energyRate,batteryHealth,batteryTemperature,batteryVoltage,\
			cpuUsage,distanceTraveled,mobileDataActivity,mobileDataStatus,\
			mobileNetworkType,networkType,roamingEnabled,screenBrightness,\
			wifiLinkSpeed,wifiSignalStrength = [i for i in range(0,14)]
		# Parse each row; categorical fields stay strings, the rest become floats.
		data = file.map(lambda line: line.split(";")).map(lambda line:
			(float(line[energyRate]),line[batteryHealth],
			float(line[batteryTemperature]),float(line[batteryVoltage]),
			float(line[cpuUsage]),float(line[distanceTraveled]),
			line[mobileDataActivity],line[mobileDataStatus],
			line[mobileNetworkType],line[networkType],
			float(line[roamingEnabled]),float(line[screenBrightness]),
			float(line[wifiLinkSpeed]),float(line[wifiSignalStrength])))
		# Keep only rows whose sensor readings are physically plausible.
		data = data.filter(lambda x:((x[screenBrightness]==-1 or(x[screenBrightness]>=0 and x[screenBrightness]<=255)) and\
							(x[cpuUsage]>=0 and x[cpuUsage]<=1) and\
							(x[distanceTraveled]>=0) and\
							(x[wifiSignalStrength]>-100 and x[wifiSignalStrength]<0) and\
							(x[batteryTemperature]>=0)))
		# Label = energyRate; four numeric context factors as features.
		data = data.map(lambda x:LabeledPoint(x[energyRate],
					[x[cpuUsage],x[screenBrightness], x[wifiSignalStrength], x[batteryTemperature]]))
		# 80/20 train/test split (weights are normalized by randomSplit).
		train,test = data.randomSplit([4,1])

		lr = LinearRegressionWithSGD.train(train,iterations=100,step=1e-4,intercept=False)
		print lr#(weights=[4.05918718288e-07,2.01710179227e-05,-3.39410603521e-05,1.70383825251e-05], intercept=0.0)
		rr = RidgeRegressionWithSGD.train(train,iterations=100,step=1e-4,intercept=False)
		print rr#(weights=[4.05918453228e-07,2.0170994023e-05,-3.39410381473e-05,1.70383716836e-05], intercept=0.0)
		l = LassoWithSGD.train(train,iterations=100,step=1e-4,intercept=False)
		print l#(weights=[0.0,1.96629057526e-05,-3.29054093642e-05,1.56445907401e-05], intercept=0.0)
		# (label, linear pred, ridge pred, lasso pred) for every test point.
		valuesAndPreds = test.map(lambda p: (p.label,lr.predict(p.features),
								rr.predict(p.features),l.predict(p.features)))
		count = valuesAndPreds.count()
		# Per-model MSE accumulated in a single pass (Python 2 tuple lambda).
		MSE = valuesAndPreds.map(lambda (v,lrp,rrp,lp): ((v - lrp)**2/count,
									(v - rrp)**2/count,(v - lp)**2/count))\
							.reduce(lambda a,b:(a[0]+b[0],a[1]+b[1],a[2]+b[2]))
		print MSE #(4.7634385303075644e-05, 4.7634387065855108e-05, 4.7873793406702168e-05)
		return None
Esempio n. 12
0
def main():
    records = get_records()
    records.cache()

    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))

    rr_model = RidgeRegressionWithSGD.train(data,
                                            iterations=10,
                                            step=0.1,
                                            intercept=False)
    true_vs_predicted_rr = data.map(lambda p:
                                    (p.label, rr_model.predict(p.features)))

    print "Ridge Regression Model predictions: " + str(
        true_vs_predicted_rr.take(5))

    calculate_print_metrics("Ridge Regression", true_vs_predicted_rr)
Esempio n. 13
0
    def test_regression(self):
        """SGD regressors and DecisionTree on SciPy sparse input track label sign."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree

        # Sparse points: only feature 1 is set, and it determines the label sign.
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        def check_sign(model):
            # Prediction sign must match each point's label sign.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        check_sign(LinearRegressionWithSGD.train(rdd))
        check_sign(LassoWithSGD.train(rdd))
        check_sign(RidgeRegressionWithSGD.train(rdd))

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        check_sign(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
# Show a few rows of the regression training set as a sanity check.
print (training_reg.take(5))

def evaluate_model_reg(test, model):
    """Return the mean squared error of `model` over the RDD `test`.

    test  -- RDD of LabeledPoint-like objects (needs .label and .features)
    model -- fitted regression model exposing .predict(features)
    """
    valuesAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
    # Indexing replaces the Python-2-only `lambda (v, p):` tuple parameter,
    # making the function valid under Python 3 as well.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return (MSE)


### LinearRegression with SGD
# Test MSE of LinearRegressionWithSGD under L2, L1 and no regularization.
model_lreg_sgd_l2 = evaluate_model_reg(test_reg,LinearRegressionWithSGD.train(training_reg,iterations=1000,step=0.0001,regType="l2"))
model_lreg_sgd_l1 = evaluate_model_reg(test_reg,LinearRegressionWithSGD.train(training_reg,iterations=1000,step=0.0001,regType="l1"))
model_lreg_sgd_l0 = evaluate_model_reg(test_reg,LinearRegressionWithSGD.train(training_reg,iterations=1000,step=0.0001,regType=None))

### RidgeRegression
model_ridge = evaluate_model_reg(test_reg,RidgeRegressionWithSGD.train(training_reg,iterations=1000, step=0.0001))

### Lasso
model_lasso = evaluate_model_reg(test_reg,LassoWithSGD.train(training_reg,iterations=1000, step =0.0001))

#################### OUTPUTS #################################
# NOTE(review): the model_svm_* and model_log_* values printed below are
# computed elsewhere in this file; only the variables above are defined in
# this excerpt.
print("Testing Error :"+"model_svm_l2 = " + str(model_svm_l2))
print("Testing Error :"+"model_svm_l1 = " + str(model_svm_l1))
print("Testing Error :"+"model_svm_l0 = " + str(model_svm_l0))

print("Testing Error :"+"model_log_lbfgs_l2 = " + str(model_log_lbfgs_l2))
print("Testing Error :"+"model_log_lbfgs_l1 = " + str(model_log_lbfgs_l1))
print("Testing Error :"+"model_log_lbfgs_l0 = " + str(model_log_lbfgs_l0))

print("Testing Error :"+"model_log_sgd_l2 = " + str(model_log_sgd_l2))
print("Testing Error :"+"model_log_sgd_l1 = " + str(model_log_sgd_l1))
Esempio n. 15
0
def performRidgeRegression(training):
	"""Fit and return a ridge-regression model (100 SGD iterations, step 0.001)."""
	return RidgeRegressionWithSGD.train(training, iterations = 100, step = 0.001)
Esempio n. 16
0
#load and parse the data
def parsePoint(line):
    """Parse one comma- or space-delimited line into a LabeledPoint.

    Column 6 is the label; columns 0-5 are the features.
    """
    # Plain float() instead of np.float: the np.float alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24, and behaved identically to float.
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[6], values[0:6])


data = sc.textFile("/user/cloudera/hw1/train_nohead.csv")
wholedata = sc.textFile("/user/cloudera/hw1/wholedata.csv")

parsedData = data.map(parsePoint)
parsedWholeData = wholedata.map(parsePoint)

#Build the model
model = RidgeRegressionWithSGD.train(parsedData,
                                     iterations=100,
                                     step=0.1,
                                     regParam=0.01)

#Evaluate the model: RMSE over the full data set
valuesAndPreds = parsedWholeData.map(lambda p:
                                     (p.label, model.predict(p.features)))

# Indexing replaces the Python-2-only `lambda (v, p):` tuple parameter,
# so this also runs under Python 3.
RMSE = np.sqrt(
    valuesAndPreds
    .map(lambda vp: (vp[0] - vp[1])**2)
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
)
print("ridge regression output : \n")
print("RMSE = {0}\n".format(RMSE))

#save and load model
Esempio n. 17
0
                              (p.label, model_least.predict(p.features)))
# NOTE(review): `valuesAndPreds` is built by a statement that starts above
# this excerpt (predictions of model_least over the test set).
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
test_cnt = valuesAndPreds.count()
least_RMSE_test = math.sqrt(MSE)  # least-squares model, test RMSE

valuesAndPreds = trainData.map(lambda p:
                               (p.label, model_least.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
train_cnt = valuesAndPreds.count()
least_RMSE_train = math.sqrt(MSE)  # least-squares model, training RMSE

# Ridge Regression
model_ridge = RidgeRegressionWithSGD.train(trainData,
                                           regParam=0.01,
                                           intercept=True)

valuesAndPreds = testData.map(lambda p:
                              (p.label, model_ridge.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
test_cnt = valuesAndPreds.count()
ridge_RMSE_test = math.sqrt(MSE)  # ridge model, test RMSE

valuesAndPreds = trainData.map(lambda p:
                               (p.label, model_ridge.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
train_cnt = valuesAndPreds.count()
ridge_RMSE_train = math.sqrt(MSE)  # ridge model, training RMSE
Esempio n. 18
0
print(sameModel.predict(SparseVector(2, {
    0: 100.0,
    1: 150
})))  # predict a single value from a SparseVector input
test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(SparseVector(2, {0: i, 1: j}))
print(sameModel.predict(sc.parallelize(test_set)).collect())  # predict many values; returns an RDD
print(sameModel.weights)  # the fitted model weights

# ----------------- Ridge regression ------------------

from pyspark.mllib.regression import RidgeRegressionWithSGD

# Tiny demo training set of dense LabeledPoints.
data = [
    LabeledPoint(1.0, [1.0, 1.0]),
    LabeledPoint(4.0, [1.0, 3.0]),
    LabeledPoint(8.0, [2.0, 3.0]),
    LabeledPoint(10.0, [3.0, 4.0])
]
train_set = sc.parallelize(data)
# Fit a ridge model starting from the supplied initial weights.
rrm = RidgeRegressionWithSGD.train(train_set,
                                   iterations=100,
                                   initialWeights=np.array([1.0, 1.0]))
test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(np.array([i, j]))
print(rrm.predict(sc.parallelize(test_set)).collect())
print(rrm.weights)
Esempio n. 19
0
def modelSelection(argv):
	"""Pick, train and evaluate a model based on command-line arguments.

	argv layout: [script, dataset-path, supervised|unsupervised,
	classification/regression/clustering, target parameter, other params...].
	Writes metrics to 'results.txt' and returns the trained model (or None
	on invalid arguments).

	NOTE(review): relies on module-level helpers (csvFilterAndMap,
	jsonFilterAndMap, performRegression, performClustering) and on names
	such as sc, NaiveBayes, KMeans imported elsewhere in this file.
	"""
	if len(argv) < 5:
		print("The arguments for this script require:\n" +
				"(hdfs or file):///path/to/filename of the dataset\n" +
				"supervised/unsupervised\n" +
				"classifier/regression/clustering\n" +
				"parameter trying to be guessed\n" +
				"other parameters\n")
	else:
		args = argv[1:]

		#sets up the RDD
		dataset = sc.textFile(args[0])
		params = args[3:]
		if args[0][-3:] == "csv":
			dataset = csvFilterAndMap(dataset, params)
			
		elif args[0][-4:] =="json":
			dataset = jsonFilterAndMap(dataset, params)

		else:
			print("This program only supports .csv and .json files")
			return
		#Model selection algorithm. Currently goes off of scikit learn's cheat sheet
		if args[1] == "supervised":
			# First column is the label; the remaining columns are features.
			labels = dataset.map(lambda x: x[0])
			values = dataset.map(lambda x: x[1:])
			zipped_data = labels.zip(values).map(lambda x: LabeledPoint(x[0], x[1:])).cache()
			datasetTraining, datasetTest = zipped_data.randomSplit([.8, .2])
			
			if args[2] == "classification":
				theModel = NaiveBayes.train(datasetTraining)

				# Pair true labels with predictions over the held-out split.
				test_preds = (datasetTest.map(lambda x: x.label).zip(theModel.predict(datasetTest.map(lambda x: x.features))))
				predictions = theModel.predict(datasetTest.map(lambda x: x.features))
				test_metrics = MulticlassMetrics(test_preds.map(lambda x: (x[0], float(x[1]))))
				testing_accuracy = test_metrics.precision()
				
				with open('results.txt', 'w+') as f:
					f.write("accuracy: " + str(testing_accuracy) + "\n")
					f.write("confusion matrix:\n" + str(test_metrics.confusionMatrix().toArray()))
				return theModel

			elif args[2] == "regression":
				# Pick lasso/linear/ridge on a 30% sample, then train on the split.
				sample = zipped_data.sample(False, .3)
				model = performRegression(sample, params)
				if(model == "lasso"):
					theModel = LassoWithSGD.train(datasetTraining, iterations = 1000, step = 0.001)
				elif(model == "linear"):
					theModel = LinearRegressionWithSGD.train(datasetTraining, iterations = 1000, step = 0.001)
				else:
					theModel = RidgeRegressionWithSGD.train(datasetTraining, iterations = 1000, step = 0.001)
				test = (datasetTest.map(lambda x: x.label).zip(theModel.predict(datasetTest.map(lambda x: x.features))))
				metrics = RegressionMetrics(test.map(lambda x: (x[0], float(x[1]))))
				value = metrics.rootMeanSquaredError
				with open('results.txt', 'w+') as f:
					f.write(model +" root mean squared error: ")
					f.write(str(value))
				return theModel

			else:
				print("Please use rather classification or regression for supervised learning")
				return

		elif args[1] == "unsupervised":
			sample = dataset.sample(False, .3)
			with open('datapoints.txt', 'w+') as f:
				f.write("dataset:	" + str(dataset.take(10)))
				f.write('\n\n')

			if args[2] == "clustering":
				model = performClustering(sample, params)
				
				# NOTE(review): "GuassianMixture" looks like a misspelling of
				# pyspark's GaussianMixture; as written this branch raises
				# NameError unless the misspelled name is defined elsewhere.
				if(model[0] == "gaussian"):
					theModel = GuassianMixture.train(dataset, model[1])
				else:
					theModel = KMeans.train(dataset, model[1])
				with open('results.txt', 'w+') as f:
					f.write(str(model))
				return theModel
			else:
				print("Currently this model selection algorithm only supports clustering for unsupervised algorithms")
				return
from pyspark.mllib.regression import LabeledPoint, RidgeRegressionWithSGD
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# Load and parse the data
def parsePoint(line):
    """Parse one ';'-separated winequality row into a LabeledPoint.

    Column 11 (quality) becomes the label; columns 0-9 become the features.
    NOTE(review): values[0:10] stops at column 9, so column 10 is silently
    excluded from the features -- confirm whether that is intentional.
    """
    values = [float(x) for x in line.split(";")]
    return LabeledPoint(values[11], values[0:10])


sc = SparkContext("local", "Simple App")
data = sc.textFile("../winequality.csv")
parsedData = data.map(parsePoint)

# Build the model
model = RidgeRegressionWithSGD.train(parsedData)

# Evaluating the model on training data.
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
# Indexing replaces the Python-2-only `lambda (v, p):` tuple parameter.
# NOTE(review): exact inequality between continuous predictions and labels
# makes this "error rate" ~1.0 for any regression model; an MSE/RMSE metric
# would be more meaningful.
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import RidgeRegressionWithSGD, RidgeRegressionModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")
# LibSVM-format feature files at the full and reduced resolutions.
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_360.txt')

model = RidgeRegressionWithSGD.train(traindata)

# Indexing (vp[0], vp[1]) replaces the Python-2-only `lambda (v, p):`
# tuple-parameter syntax used previously, so this also runs on Python 3.
predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_Ridge")
predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(predictions_720)
MSE_720 = labelsandpredictions_720.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/hf_720_Ridge")
predictions_540 = model.predict(data_540.map(lambda x: x.features))
labelsandpredictions_540 = data_540.map(lambda lp: lp.label).zip(predictions_540)
MSE_540 = labelsandpredictions_540.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data_540.count())
print("training MSE_540 = " + str(MSE_540))
labelsandpredictions_540.saveAsTextFile("/usr/hadoop/hf_540_Ridge")
predictions_360 = model.predict(data_360.map(lambda x: x.features))
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import RidgeRegressionWithSGD, RidgeRegressionModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")
# SSIM feature files: full set, training subset, and per-resolution variants.
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_ssim.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_360.txt')

model = RidgeRegressionWithSGD.train(traindata)

# Score the full dataset: print MSE and persist (label, prediction) pairs.
# `lambda (v, p):` tuple unpacking is Python-2-only syntax (removed by
# PEP 3113); rewritten with indexing so the script runs on Python 3.
predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/ssim_Ridge")

# Same evaluation for the 720p variant.
predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
MSE_720 = labelsandpredictions_720.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/ssim_720_Ridge")

# NOTE(review): the original snippet is truncated here — the 540p pairs
# are built but never scored or saved.
predictions_540 = model.predict(data_540.map(lambda x: x.features))
labelsandpredictions_540 = data_540.map(lambda lp: lp.label).zip(
    predictions_540)
# Esempio n. 23
# 0
    def test_regression(self):
        """Train each MLlib regressor on a tiny sign-labelled dataset and
        check that its predictions recover the sign of each label; also
        verify that initialWeights and maxBins are passed through.
        """
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_sign_predictions(model):
            # The expected signs mirror the labels of `data` above;
            # factored out of six copy-pasted assertion groups.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        check_sign_predictions(LinearRegressionWithSGD.train(rdd, iterations=10))
        check_sign_predictions(LassoWithSGD.train(rdd, iterations=10))
        check_sign_predictions(RidgeRegressionWithSGD.train(rdd, iterations=10))

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        check_sign_predictions(DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
        check_sign_predictions(RandomForest.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1))
        check_sign_predictions(GradientBoostedTrees.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4))

        # Supplying explicit initial weights must not raise.
        try:
            LinearRegressionWithSGD.train(rdd,
                                          initialWeights=array([1.0, 1.0]),
                                          iterations=10)
            LassoWithSGD.train(rdd,
                               initialWeights=array([1.0, 1.0]),
                               iterations=10)
            RidgeRegressionWithSGD.train(rdd,
                                         initialWeights=array([1.0, 1.0]),
                                         iterations=10)
        except ValueError:
            self.fail()

        # Verify that maxBins is being passed through: 32 bins accommodate
        # the 2-category feature, a single bin cannot.
        GradientBoostedTrees.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4,
            maxBins=32)
        with self.assertRaises(Exception):  # unused `as cm` binding removed
            GradientBoostedTrees.trainRegressor(
                rdd,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                numIterations=4,
                maxBins=1)
        y_predict = []

        # NOTE(review): `sc` is created only when con.HDF is falsy, yet it is
        # used unconditionally below — confirm a SparkContext already exists
        # in the HDF path, otherwise this raises NameError.
        if not con.HDF:
            sc = SparkContext(appName="VolEstimation")

        # Post-process raw input in pandas before loading train/test splits.
        postProcess(con.DATA_PATH + con.FILE_NM, vol=7)

        xy_test_points = pd.read_csv(
            os.path.abspath(os.curdir) + '/data/' + con.TEST_FN)
        # NOTE(review): DataFrame.ix is deprecated in pandas; presumably the
        # slices are positional here — TODO migrate to .iloc after verifying
        # the CSV's column labels.
        x_test_points = xy_test_points.ix[:, 1:11].values.tolist(
        )  # omit y output col
        y_test_points = xy_test_points.ix[:, 0].values.tolist()

        xy_train_points = sc.textFile(
            os.path.abspath(os.curdir) + '/data/' +
            con.TRAIN_FN).map(parsePoint)

        model = RidgeRegressionWithSGD.train(xy_train_points, iterations=5000)

        # Score each held-out point and compute the out-of-sample R^2.
        for x in x_test_points:
            y_predict.append(model.predict(x))
        r2_knn = r2_score(y_test_points, y_predict)

        # Python 2 `print` statements converted to print() calls for
        # consistency with the rest of this file.
        print("Final Out of Sample R^2 of Regression" + str(r2_knn))
        print("Final weights: " + str(model.weights))
        print("Final intercept: " + str(model.intercept))

        # Kill Spark context gracefully.
        sc.stop()