def do_all(f_path,out_name):
	sc = SparkContext()
	data = sc.textFile(f_path)

	data = data.map(parseKeepD).filter(lambda p: p[0] is not None)

	# Scale Features
	features = data.map(lambda x: x[0].features)
	summary = Statistics.colStats(features)
	global means
	global varis
	means = summary.mean()
	varis = summary.variance()

	#scale the points
	data = data.map(lambda y: (conv_label_pt(y[0]),y[1]))

	#train model
	model = LinearRegressionWithSGD.train(data.map(lambda x: x[0]), intercept=True, regType=None)

	#calculate disparity
	disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))  

	#calculate SSR for later
	ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()

	#keep N
	N = disparity.count()
	#mean squared error
	MSE = ssr / float(N)
	se = std_errors(data,MSE,N)
	#out_loc is assumed to be a module-level output directory
	disparity.saveAsTextFile(out_loc + out_name)

	sc.stop()
	return model.intercept, model.weights, se, disparity, ssr, N
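do_all assumes several helpers defined elsewhere in the module (parseKeepD, conv_label_pt, std_errors) plus an out_loc output directory. As one hedged illustration, a minimal sketch of the scaling helper, assuming conv_label_pt standardizes each feature with the global means/varis set above:

# Hypothetical sketch of the scaling helper assumed by do_all
import numpy as np
from pyspark.mllib.regression import LabeledPoint

def conv_label_pt(lp):
	# means and varis are module-level globals populated in do_all
	scaled = (np.array(lp.features) - means) / np.sqrt(varis)
	return LabeledPoint(lp.label, scaled)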
Example #2
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
def iterateLRwSGDBatch(iterNums, stepSizes, fractions, train, valid):
  from pyspark.mllib.regression import LinearRegressionWithSGD
  import math
  for numIter in iterNums:
    for step in stepSizes:
      for miniBFraction in fractions:
        model = LinearRegressionWithSGD.train(train, intercept=True, iterations=numIter, step=step, miniBatchFraction=miniBFraction)
        rescaledPredicts = train.map(lambda x: (model.predict(x.features), x.label))
        validPredicts = valid.map(lambda x: (model.predict(x.features), x.label))
        # root mean squared errors on the training and validation sets
        rmseTrain = math.sqrt(rescaledPredicts.map(lambda p: pow(p[0] - p[1], 2)).mean())
        rmseValid = math.sqrt(validPredicts.map(lambda p: pow(p[0] - p[1], 2)).mean())
        print("%d, %5.3f %5.3f -> %.4f, %.4f" % (numIter, step, miniBFraction, rmseTrain, rmseValid))
def iterateLRwSGD(iterNums, stepSizes, train, valid):
  from pyspark.mllib.regression import LinearRegressionWithSGD
  import math
  for numIter in iterNums:
    for step in stepSizes:
      model = LinearRegressionWithSGD.train(train, iterations=numIter, step=step, intercept=True)
      rescaledPredicts = train.map(lambda x: (float(model.predict(x.features)), x.label))
      validPredicts = valid.map(lambda x: (float(model.predict(x.features)), x.label))
      # root mean squared errors on the training and validation sets
      rmseTrain = math.sqrt(rescaledPredicts.map(lambda p: pow(p[0] - p[1], 2)).mean())
      rmseValid = math.sqrt(validPredicts.map(lambda p: pow(p[0] - p[1], 2)).mean())
      print("%d, %5.3f -> %.4f, %.4f" % (numIter, step, rmseTrain, rmseValid))
Example #5
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)
Example #6
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)
def regression():
    #Regression Point
    #Reads the data from the joinedResults directory as a parquet file
    datadf = sqlContext.read.parquet(output+"/joinedResults")
    datadf.show()
    data = datadf.rdd.map(lambda w: (float(w.avg_prcp), int(w.yy), float(w.latitude), float(w.longitude)))
    max_prcp = data.max()
    min_prcp = data.min()
    lat = data.map(lambda x: (x[2])).cache()
    min_lat = lat.min()
    max_lat = lat.max()

    longt =  data.map(lambda x: (x[3])).cache()
    min_long = longt.min()
    max_long = longt.max()
    
    max_ = [max_prcp[0], 2050.0, max_lat, max_long]
    min_ = [min_prcp[0], 1990.0, min_lat, min_long]
    # change the format to fit in LinearRegression library
    parsedData = data.map(lambda x: parsePointPrediction(x, max_, min_)).cache()
    # Split data approximately into training (80%) and test (20%)
    trainData, testData = parsedData.randomSplit([0.8, 0.2], seed = 0)
    trainData.cache()
    testData.cache()
    # Build the model; parameters were chosen by trial and error
    model = LinearRegressionWithSGD.train(trainData, iterations=500, regType="l2", regParam=10, intercept=True)
    # Evaluate the model on test data
    valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    maxVal = max_prcp[0]

    model.save(sc, output+"/modelpath")
    return
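regression() depends on a parsePointPrediction helper defined elsewhere. A minimal sketch, assuming it min-max scales each field with the max_/min_ lists and uses the scaled precipitation value as the label:

# Hypothetical sketch: scale each field into [0, 1] and emit a LabeledPoint
from pyspark.mllib.regression import LabeledPoint

def parsePointPrediction(x, max_, min_):
    scaled = [(x[i] - min_[i]) / (max_[i] - min_[i]) for i in range(4)]
    return LabeledPoint(scaled[0], scaled[1:])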
def evaluate(train,test,iterations,step,regParam,regType,intercept):
    model = LinearRegressionWithSGD.train(train, iterations, step, regParam=regParam, regType=regType, intercept=intercept)
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    rmse = np.sqrt(tp.map(lambda t_p: squarred_error(t_p[0], t_p[1])).mean())
    # MAE is a plain mean of absolute errors; no square root
    mae = tp.map(lambda t_p: abs_error(t_p[0], t_p[1])).mean()
    rmsle = np.sqrt(tp.map(lambda t_p: squared_log_error(t_p[0], t_p[1])).mean())
    opt_metrics = [rmse, mae, rmsle]
    return opt_metrics
def get_best_result(best_step_size, training_lp, testing_lp, iterations):
    model = LinearRegressionWithSGD.train(training_lp, iterations=iterations, step=best_step_size, regType='l2')
    values_and_preds = testing_lp.map(lambda p: (p.label, model.predict(p.features)))
    # take the mean, not just the sum, so the RMSE is on the right scale
    MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).mean()
    RMSE = math.sqrt(MSE)

    result_str = 'best step size from cross-validation: ' + str(best_step_size) + ', lowest RMSE: ' + str(RMSE)
    return result_str
Example #10
def getRMSE(step_array):
	valRMSE_list = []
	for step in step_array:
		model = LinearRegressionWithSGD.train(train_featureScoreTimeRDD, iterations=5000, step=step)
		labelsAndPreds = val_featureScoreTimeRDD.map(lambda p: (p.label, model.predict(p.features)))
		valMSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / val_featureScoreTimeRDD.count()
		valRMSE=valMSE**0.5
		valRMSE_list.append((step, valRMSE))
	return valRMSE_list
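A hedged usage sketch for getRMSE; the step values are illustrative and the two feature RDDs are assumed to exist as globals:

# Hypothetical step-size sweep; keep the step with the lowest validation RMSE
results = getRMSE([0.01, 0.05, 0.1, 0.5])
best_step, best_rmse = min(results, key=lambda sr: sr[1])
print('best step: %s, validation RMSE: %s' % (best_step, best_rmse))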
def linearRegression(features,sc,output_n):
	features_and_label = features.collect()
	training_features_labels = features_and_label[0:70]

	testing_features_labels = features_and_label[70:116]

	labeled_training = [LabeledPoint(x[0], x[1]) for x in training_features_labels]
	labeled_testing = [LabeledPoint(y[0], y[1]) for y in testing_features_labels]

	training_data = sc.parallelize(labeled_training)
	testing_data = sc.parallelize(labeled_testing)

	linearregression_model = LinearRegressionWithSGD.train(training_data, iterations=0, regParam=200)
	prediction = testing_data.map(lambda line: (line.label, linearregression_model.predict(line.features)))
	return linearregression_model, prediction
Example #12
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):

    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)

# not RDD data 

    ndata = data.map(lambda line: line.split(character)).map(lambda part: [float(x) for x in part])

    if label == 0:
        ndata = ndata.map(lambda line: line[::-1])

    if normalize == 1:
        test_data = norm(ndata.collect())    
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))   
     #raw_data = data.map(lambda line: line.split(character))


    else:
        test_data = ndata.map(lambda part: (part[len(part) - 1], part[0:len(part) - 1])).collect()
        train_data = ndata.map(lambda part: lbp(part[len(part) - 1], part[0: len(part) - 1]))
    
    
    if ispca == 1:
        pca = PCA(n_components = pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)

        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])

        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
            

    model_lr = lr.train(train_data)
    err_lr = 0.0
    size = len(train_data.collect())
   
    for i in range(size):
        err_lr = err_lr + abs(model_lr.predict(test_data[i][1]) - test_data[i][0])
           

    print "result:", err_lr/size

    String = "Linear Regression Result:\n"
    String = String + str(model_lr.weights) + '\n'
    String = String + "Error: " + str(err_lr / size) 
    
    sc.stop()

    return String
Example #13
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
def get_best_stepsize(step_sizes, training_lp, testing_lp, iterations):
    best_stepsize = 0
    lowest_RMSE = float("inf")
    for step_size in step_sizes:
        model = LinearRegressionWithSGD.train(training_lp, iterations=iterations, step=step_size)
        values_and_preds = testing_lp.map(lambda p: (p.label, model.predict(p.features)))
        # mean squared error (mean, not sum) before taking the root
        MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).mean()
        RMSE = math.sqrt(MSE)
        if RMSE < lowest_RMSE:
            lowest_RMSE = RMSE
            best_stepsize = step_size

    result_str = 'best step size: ' + str(best_stepsize) + ', lowest RMSE: ' + str(lowest_RMSE)
    return result_str
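A usage sketch with a hypothetical log-spaced grid; training_lp and testing_lp are assumed LabeledPoint RDDs:

# Hypothetical call sweeping four step sizes
print(get_best_stepsize([0.001, 0.01, 0.1, 1.0], training_lp, testing_lp, iterations=100))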
def LinearRegression(filename, sc):
	filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
	data = sc.textFile(filename)
	parsedData = data.map(parsePoint)

	# train the model
	model = LinearRegressionWithSGD.train(parsedData)

	# Evaluate the model on training data
	valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
	print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

	# Save and load model
	#model.save(sc, "myModelPath")
	#sameModel = LinearRegressionModel.load(sc, "myModelPath")
Example #16
def algo(a):
    global data
    global week 
    global target
    test = week 
    week_target = week.map(convert)
    #apply(convert, axis=1)
    #np.random.seed(123)
    data_final = LabeledPoint(target, data)
    #make rdd that is input for algo 


    if a == 'sgd':
        time_0 = time.time()
        lrm = LinearRegressionWithSGD.train(sc.parallelize([data_final]), iterations=10, initialWeights=np.array([1.0]))
        print(abs(lrm.predict(test)))
        print(time.time() - time_0)
Example #17
def test_spark():
    def parsePoint(line):
        values = [float(x) for x in line.replace(',', ' ').split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile(r"/usr/local/Cellar/apache-spark/1.6.1/libexec/data/mllib/ridge-data/lpsa.data")
    parsedData = data.map(parsePoint)
    print(parsedData.collect())

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    print("Model coefficients: " + str(model))
def linearRegression(features,sc,output_n):
	features_and_label = features.collect()
	training_features_labels = features_and_label[0:70]
	
	testing_features_labels = features_and_label[70:]

	labeled_training = []
	labeled_testing = []
	for x in training_features_labels:
		labeled_training.append(LabeledPoint(x[0],x[1]))

	for y in testing_features_labels:
		labeled_testing.append(LabeledPoint(y[0],y[1]))

	train = sc.parallelize(labeled_training)
	test = sc.parallelize(labeled_testing)

	linearregression_model = LinearRegressionWithSGD.train(train, iterations=0, regParam=200)
	predictions = test.map(lambda line: (line.label, float(linearregression_model.predict(line.features))))
	return predictions
def linearRegression_f(mode):
    if   mode == "no_reg":
         model = LinearRegressionWithSGD.train(parsedData)
    elif mode == "L1_reg":
         model = LassoWithSGD.train(parsedData)
    elif mode == "L2_reg":
         model = RidgeRegressionWithSGD.train(parsedData)
    else:
        raise ValueError("unknown regularization mode: " + mode)
        
    #Evaluate the model on training data
    # parsedData map method to get {train_data, predict_data} pairs 
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    
    #calculate the key-value pairs to get MSE
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y)/valuesAndPreds.count()
    
  
    return MSE
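linearRegression_f reads a module-level parsedData RDD. A minimal sketch of how it might be prepared, assuming a live SparkContext sc and the same comma/space text format as the lpsa examples elsewhere in this collection:

# Hypothetical setup for the parsedData global used above
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:])

parsedData = sc.textFile("data/mllib/ridge-data/lpsa.data").map(parsePoint)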
def LinearRegression(trainFile, testFile, taskid,sc):
	# filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
	# data = sc.textFile(filename)
	# parsedData = data.map(parsePoint)

	trainData = MLUtils.loadLibSVMFile(sc, trainFile)
	testData = MLUtils.loadLibSVMFile(sc, testFile)

	# train the model
	model = LinearRegressionWithSGD.train(trainData)

	# Evaluate the model on training data
	# predictionAndLabels = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	predictionAndLabels = testData.map(lambda p: (p.label, model.predict(p.features)))
	MSE = predictionAndLabels.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / predictionAndLabels.count()
	print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

	# Save and load model
	#model.save(sc, "myModelPath")
	#sameModel = LinearRegressionModel.load(sc, "myModelPath")
def get_best_stepsize(step_sizes, training_lp, iterations, cv_trails):
    best_stepsize = 0
    lowest_RMSE = float("inf")
    num_folds = 4
    fold_set = [1]*num_folds
    cv_data = training_lp.randomSplit(fold_set) # 4 folds
    for step_size in step_sizes:
        total_RMSE = 0.0
        for i in range(num_folds):
            cv_testing = cv_data[i]
            cv_training = training_lp.subtract(cv_testing)
            model = LinearRegressionWithSGD.train(cv_training, iterations=iterations, step=step_size)
            values_and_preds = cv_testing.map(lambda p: (p.label, model.predict(p.features)))
            MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).mean()
            RMSE = math.sqrt(MSE)
            total_RMSE += RMSE
        # average over the folds
        avg_RMSE = total_RMSE / num_folds
        if avg_RMSE < lowest_RMSE:
            lowest_RMSE = avg_RMSE
            best_stepsize = step_size

    return best_stepsize
    def train_amount_model(self, model, data, i):
        rdd_data = self.sc.parallelize(data)
        self.logger.info('Start to train the amount model')
        if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK:
            input_num = self.feature_num
            layers = [input_num, input_num // 3 * 2, input_num // 3, 1]

            neural_network = NeuralNetworkSpark(layers=layers, bias=0)
            model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234, learn_rate=0.0001,
                                         iteration=15, model=model)
        elif self.amount_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={}, numTrees=40,
                                                featureSubsetStrategy="auto", impurity='variance', maxDepth=20,
                                                maxBins=32)

        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                  initialWeights=model.weights if model is not None else None)

        else:
            self.logger.error("Unknown training method {}".format(self.amount_prediction_method))
            raise ValueError("Unknown training method {}".format(self.amount_prediction_method))
        return model
Example #23
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
def learn_model(sc, file_path, normalize):
	feature_file = sc.textFile(file_path).map(lambda l:l.split("\t"))

	points = feature_file.map(lambda f: LabeledPoint(f[1], f[2:]))
	
	#normalizing
	if normalize:
		nor      = Normalizer()
		labels   = points.map(lambda x: x.label)
		features = points.map(lambda x: x.features)
		points = labels.zip(nor.transform(features))
		points = points.map(lambda i: LabeledPoint(i[0], i[1]))

	training, testing = points.randomSplit([0.7,0.3],11)
	index = 0
	iterations = 100
	p_mse = -1
	converge = False
	result = {}
	while not converge:
		x = time.time()
		model = LinearRegressionWithSGD.train(training, iterations=iterations, step=0.00001, intercept=True, regType="l1")
		y = time.time()
		print("========== time = " + str(y - x))
		preds = testing.map(lambda p: (p.label, model.predict(p.features)))
		MSE = preds.map(lambda r: (r[1] - r[0])**2).reduce(lambda x, y: x + y) / preds.count()
		print("========== MSE = " + str(MSE))
		if p_mse == MSE:
			converge = True

		iterations = iterations + 100
		result[iterations] = MSE
		p_mse = MSE
	
	print(result)
	return model
Example #25
def evaluate(train_set, iterations, step, reg_param, reg_type, intercept):
    # create a linear model using stochastic gradient descent
    model = LinearRegressionWithSGD.train(train_set, iterations, step,
                                          regParam=reg_param,
                                          regType=reg_type,
                                          intercept=intercept)
    # use test data -> rdd: [(actual_value, predicted_value), (...), (...), ......]
    tlabel_tprediction = train_set.map(
        lambda point: (point.label, model.predict(point.features)))
    # calculate Root Mean Squared Log Error
    rmsle = np.sqrt(
        tlabel_tprediction.map(
            lambda tp: squared_log_error(tp[0], tp[1])).mean())
    return rmsle
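A hedged usage sketch for evaluate, mirroring the commented-out tuning loops further below in this collection; data is assumed to be the LabeledPoint RDD used in the __main__ block:

# Hypothetical sweep over the SGD step size, holding the other parameters fixed
for step in [0.01, 0.025, 0.05, 0.1]:
    print('step=%.3f -> RMSLE=%.4f' % (step, evaluate(data, 10, step, 0.0, 'l2', False)))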


if __name__ == '__main__':
    # create linear model and test
    linear_model = LinearRegressionWithSGD.train(data,
                                                 iterations=200,
                                                 step=0.05,
                                                 intercept=False)
    linear_model.save(
        sc,
        'PricePrediction/model/LR.model')  # save the trained model to local
    true_vs_predicted = data.map(
        lambda point: (point.label, linear_model.predict(point.features)))
    print('Linear model predictions for the first 5 samples: ' + str(true_vs_predicted.take(5)))  # test
    '''
    same_md = LinearRegressionModel.load(sc, 'PricePrediction/model/LR.model')
    true_vs_predicted = data.map(lambda point: (point.label, same_md.predict(point.features)))
    print(str(true_vs_predicted.take(2)))
    '''

    # error analysis
    m_s_e = true_vs_predicted.map(
        lambda tp: squared_error(tp[0], tp[1])).mean()
Example #26
data_with_idx = data.zipWithIndex().map(lambda kv: (kv[1], kv[0]))
test = data_with_idx.sample(False, 0.1, 100)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda idx_p: idx_p[1])
test_data = test.map(lambda idx_p: idx_p[1])
train_size = train_data.count()
test_size = test_data.count()
print("Training data size: %d" % train_size)
print("Test data size: %d" % test_size)
print("Total data size: %d" % num_data)
print("Train + Test size: %d" % (train_size + test_size))

# make the linear regression horsepower model

linear_model_hp = LinearRegressionWithSGD.train(train_data,
                                                iterations=100,
                                                step=0.0000001,
                                                intercept=False)
linear_model_hp

# make predictions and measure error
true_vs_predicted = test_data.map(
    lambda p: (p.label, linear_model_hp.predict(p.features)))


def squared_error(actual, pred):
    return (pred - actual)**2


def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2
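Given the two helpers above, the aggregate errors over true_vs_predicted can be computed as in the sketch below (this evaluation step is illustrative and follows the helper signatures; it is not part of the original cell):

# MSE and RMSLE over the (label, prediction) pairs
mse = true_vs_predicted.map(lambda tp: squared_error(tp[0], tp[1])).mean()
rmsle = np.sqrt(true_vs_predicted.map(lambda tp: squared_log_error(tp[1], tp[0])).mean())
print("MSE: %2.4f, RMSLE: %2.4f" % (mse, rmsle))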
# In[77]:

from pyspark.mllib.regression import LinearRegressionWithSGD
# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept


# In[79]:

# TODO: Replace <FILL IN> with appropriate code
firstModel = LinearRegressionWithSGD.train(parsedTrainData, numIters, alpha, miniBatchFrac, None, reg, regType, useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
print(weightsLR1, interceptLR1)


# In[80]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
expectedWeights = [16.682292427, 14.7439059559, -0.0935105608897, 6.22080088829, 4.01454261926, -3.30214858535,
                   11.0403027232, 2.67190962854, 7.18925791279, 4.46093254586, 8.14950409475, 2.75135810882]
Test.assertTrue(np.allclose(interceptLR1, expectedIntercept), 'incorrect value for interceptLR1')
Test.assertTrue(np.allclose(weightsLR1, expectedWeights), 'incorrect value for weightsLR1')
Example #28
STEP_SIZE = 0.00000001


def parse_data(line):
    data = list(map(lambda n: float(n), line.replace(',', ' ').split(' ')))
    return LabeledPoint(data[0], Vectors.dense(data[0], data[len(data) - 1]))

# # #    #   #    #   #    #   #    # # #    #   #    # # #
#   #    #   #    ##  #    ##  #      #      ##  #    #
# #      #   #    # # #    # # #      #      # # #    #  ##
#  #     #   #    #  ##    #  ##      #      #  ##    #   #
#   #    # # #    #   #    #   #    # # #    #   #    # # #

sc = SparkContext("local", "linear_regression_app")

file_content = sc.textFile(FILE_PATH).cache()
print(f'file_content.count = { file_content.count() }')

data = file_content.map(parse_data).cache()
print(f'data.count = { data.count() }')

model = LinearRegressionWithSGD.train(data, NUM_ITERATIONS, STEP_SIZE)
predictions = data.map(lambda point:
                       (point.label, model.predict(point.features)))

predictions.foreach(
    lambda point: print(f"Actual: { point[0] }\t| Predicted: { point[1] }"))

mse = predictions.map(lambda point: pow((point[0] - point[1]), 2)).mean()
print(f'Training Mean Squared Error = { mse }')
    CVTrainData = ZippedData.filter(lambda tup: tup[1]<int(TSize*i) or tup[1]>int(TSize*(i+1))).map(lambda x:x[0])
    CVTestData = ZippedData.filter(lambda tup: tup[1]>int(TSize*i) and tup[1]<int(TSize*(i+1))).map(lambda x:x[0])
    model = LinearRegressionWithSGD.train(CVTrainData, iterations=10000, step=0.01, regType='l1', regParam=0.1)
    values_and_preds = CVTestData.map(lambda p: (p.label, model.predict(p.features)))
    RMSE = sqrt(values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y)/values_and_preds.count())
    total_rmse += RMSE
    MAE = values_and_preds.map(lambda vp: abs(vp[0] - vp[1])).reduce(lambda x, y: x + y)/values_and_preds.count()
    total_mae += MAE
    print(RMSE)
    print(MAE)

print("Avg Root Mean Squared Error on CV = " + str(total_rmse/folds))
print("Avg Mean Absolute Error on CV = " + str(total_mae/folds))
"""

test_model = LinearRegressionWithSGD.train(parsed_train_data,
                                           iterations=10000,
                                           step=0.01,
                                           regType='l1',
                                           regParam=0.1)
values_and_preds = parsed_test_data.map(
    lambda p: (p.label, test_model.predict(p.features)))
TestRMSE = sqrt(
    values_and_preds.map(lambda vp:
                         (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) /
    values_and_preds.count())
print("Root Mean Squared Error on Test Data = " + str(TestRMSE))
TestMAE = values_and_preds.map(lambda vp: abs(vp[0] - vp[1])).reduce(
    lambda x, y: x + y) / values_and_preds.count()
print("TMean Absolute Error on Test Data = " + str(TestMAE))
Example #30
#Random Forest Model
modelRF = RandomForest.trainRegressor(trainingData,
                                      categoricalFeaturesInfo={},
                                      numTrees=5,
                                      impurity='variance',
                                      maxDepth=4,
                                      maxBins=32)
predictionsRF = modelRF.predict(testData.map(lambda x: x.features))

#Gradient Boosted Model
modelGB = GradientBoostedTrees.trainRegressor(trainingData,
                                              categoricalFeaturesInfo={},
                                              numIterations=3)
predictionsGB = modelGB.predict(testData.map(lambda x: x.features))

#Linear Regression Model
modelLin = LinearRegressionWithSGD.train(trainingData,
                                         iterations=100,
                                         step=0.00000001)
predictionsLin = modelLin.predict(testData.map(lambda x: x.features))

resultsRF = predictionsRF.collect()
resultsGB = predictionsGB.collect()
resultsLin = predictionsLin.collect()
testDataList = testData.collect()

#iterator for the test data array
count = 0
print("Random Forest")
for item in resultsRF:
    #Retrieve the actual salary from the labeledpoint
    salaryMatch = testDataList[count].label
    #Find the player who has this same salary
if __name__ == "__main__":
    sc = SparkContext(appName="Regression Metrics Example")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = line.split()
        return LabeledPoint(float(values[0]),
                            DenseVector([float(x.split(':')[1]) for x in values[1:]]))

    data = sc.textFile("data/mllib/sample_linear_regression_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Get predictions
    valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label))

    # Instantiate metrics object
    metrics = RegressionMetrics(valuesAndPreds)

    # Squared Error
    print("MSE = %s" % metrics.meanSquaredError)
    print("RMSE = %s" % metrics.rootMeanSquaredError)

    # R-squared
    print("R-squared = %s" % metrics.r2)

    # Mean absolute error
Example #32
    train_scaled = getScaledData(train)
    train_val_scaled = getScaledData(train_val)
    test_scaled = getScaledData(test)

    train.cache()
    train_scaled.cache()
    train_val.cache()
    train_val_scaled.cache()
    test.cache()
    test_scaled.cache()

    num_iterations = 10**4
    step = 10**(-5)

    model = LinearRegressionWithSGD.train(train, num_iterations, step)  # iterations, step size

    # predict
    predictions_val = model.predict(train_val_scaled.map(lambda x: x.features))
    labelsAndPreds_val = train_val_scaled.map(lambda lp: lp.label).zip(
        predictions_val).map(lambda ab: (ab[1], ab[0]))

    predictions = model.predict(test_scaled.map(lambda x: x.features))
    labelsAndPreds = test_scaled.map(lambda lp: lp.label).zip(predictions).map(
        lambda ab: (ab[1], ab[0]))

    result = open('hw4.txt', 'a')
    result.write('---------------\n')
    result.write('Validation\n')
    result.write('MAE: %.5f\n' % getMAE(labelsAndPreds_val))
    result.write('RMSE: %.5f\n\n' % getRMSE(labelsAndPreds_val))
Example #33
	label = clean_line_split[10]
	nonlabel = clean_line_split[0:10] + clean_line_split[11:]
	
	return LabeledPoint(label, nonlabel) 

data_file = sc.textFile("s3://aws-logs-012060642840-us-west-2/elasticmapreduce/cloud_proj/00-08.csv").cache()
header = data_file.first()
raw_data = data_file.filter(lambda x: x != header)

parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache()
model = LinearRegressionWithSGD.train(trainingData, iterations=1)
print('Training Time consumed = ', datetime.now() - startTime)
startTestTime = datetime.now()

# Evaluating the model on training data
valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds \
    .map(lambda vp: (vp[0] - vp[1])**2) \
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
print('Testing Time consumed = ', datetime.now() - startTestTime)
print('Total Time: ', datetime.now() - startTime)

print("Mean Squared Error = " + str(MSE))
# Save and load model
model.save(sc, "LinearRegressionNarrow00-08_cache_both_train_and_test")
sameModel = LinearRegressionModel.load(sc, "LinearRegressionNarrow00-08_cache_both_train_and_test")
Example #34
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d" % num_data
print "Train + Test size: %d" % (test_size + train_size)

df = records.map(lambda line: Row(Displacement=line[2], Horsepower=line[6])).toDF()
df.show(10)

df = df.select('Horsepower', 'Displacement')
df = df[df.Displacement > 0]
df = df[df.Horsepower > 0]
df.describe(['Horsepower', 'Displacement']).show()
temp = df.rdd.map(lambda line: LabeledPoint(line[0], line[1:]))
temp.take(5)

linearModel = LinearRegressionWithSGD.train(temp, 10000, 0.0001, intercept=False)
linearModel.weights

test_data.take(10)

true_vs_predicted = temp.map(lambda p: (p.label, linearModel.predict(p.features)))
print "Linear Model predictions: " + str(true_vs_predicted.take(100))


def squared_error(actual, pred):
    return (pred - actual) ** 2


def abs_error(actual, pred):
    return np.abs(pred - actual)
Example #35
dataset = db_client.iestimate.predictions81k_ecp_copy.find(
    {"postalcode": "01772"})

dataset = dsto_norm_labeled_points(dataset, features_regression_model)

dataset = sc.parallelize(dataset)

# Load and parse the data
# def parsePoint(line):
#     values = [float(x) for x in line.replace(',', ' ').split(' ')]
#     return LabeledPoint(values[0], values[1:])
#
# data = sc.textFile("data/mllib/ridge-data/lpsa.data")
# parsedData = data.map(parsePoint)
processed_data = dataset

# Build the model
model = LinearRegressionWithSGD.train(processed_data,
                                      iterations=300,
                                      step=0.01)

# Evaluate the model on training data
valuesAndPreds = processed_data.map(lambda p:
                                    (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "myModelPath")
sameModel = LinearRegressionModel.load(sc, "myModelPath")
Example #36
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.regression import LinearRegressionWithSGD, LabeledPoint


def textParser(price_type):
    """
    price_type: 0 selects the lowest price, 1 the average price
    """
    datas = []
    lines = open('cards2.txt')
    for line in lines:
        features = line.strip().split('\t')
        datas.append(LabeledPoint(float(features[price_type]), features[2:-1]))
    return datas

if __name__ == '__main__':
    sc = SparkContext()
    datas = sc.parallelize(textParser(1))
    model = LinearRegressionWithSGD.train(datas, step=0.00000000174434, iterations=2000, regType='l2')
    # model = LinearRegressionWithSGD.train(datas, step=0.00000000175234766555555566666, iterations=5000, regType='l2')
    print('**' * 50)
    print(model.weights)
    print(model.intercept)
    print(model.predict(array([9409, 187533, 84500, 84572])))
    print('**' * 50)
    valuesAndPreds = datas.map(lambda p: (p.label, model.predict(p.features)))
    print(valuesAndPreds.collect(), valuesAndPreds.count())
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    # 2016.1  9409    82200   82352   187533
    sc.stop()
Example #37
from pyspark.mllib.regression import LinearRegressionWithSGD
# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept

# In[145]:

# TODO: Replace <FILL IN> with appropriate code
firstModel = LinearRegressionWithSGD.train(data=parsedTrainData,
                                           iterations=numIters,
                                           step=alpha,
                                           miniBatchFraction=miniBatchFrac,
                                           initialWeights=None,
                                           regParam=reg,
                                           regType=regType,
                                           intercept=useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
print(weightsLR1, interceptLR1)

# In[146]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
expectedWeights = [
    16.682292427, 14.7439059559, -0.0935105608897, 6.22080088829,
Example #38
    def test_regression(self):
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD,
            LassoWithSGD,
            RidgeRegressionWithSGD,
        )
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2]),
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4
        )
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1
        )
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4
        )
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()

        # Verify that maxBins is being passed through
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32
        )
        with self.assertRaises(Exception):
            GradientBoostedTrees.trainRegressor(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1
            )
Example #39
total_cnt = type_cnt + number_cnt
#print type_cnt


def extract_features(fields):
    step = 0
    features = np.zeros(total_cnt)
    for t_idx in type_columns:
        features[step + int(type_maps[t_idx][fields[t_idx]])] = 1.0
        step = step + len(type_maps[t_idx])
    for n_idx in number_columns:
        features[step] = float(fields[n_idx])
        step = step + 1
    return features


data = raw_data.map(lambda fields: LabeledPoint(
    float(fields[saleprice_column]), extract_features(fields)))

#first_point= data.first()
#print "label of first point: %f" % first_point.label
#print "features of first point: %s" % str(first_point.features)
#print "feature vector length: %d" % len(first_point.features)

lrModel = LinearRegressionWithSGD.train(data,
                                        iterations=10,
                                        step=0.1,
                                        intercept=False)
actual_vs_pred = data.map(lambda p: (p.label, lrModel.predict(p.features)))
print(actual_vs_pred.take(10))
sc = SparkContext()

selcol = [1, 3, 4, 6, 18, 23, 25]
train = prep_Data("HW4/200[3-7].csv", selcol)
test = prep_Data("HW4/2008.csv", selcol)

#transform data into the format that can be feed into model
trainLabeled = train.map(
    lambda line: LabeledPoint(extract_label(line), extract_features(line)))
testLabeled = test.map(
    lambda line: LabeledPoint(extract_label(line), extract_features(line)))

#preserver some part of the data as validation data
train_dataset, val_dataset = trainLabeled.randomSplit([0.7, 0.3])

#train
linear_model_val = LinearRegressionWithSGD.train(train_dataset, 100000,
                                                 0.00000000001)
linear_model = LinearRegressionWithSGD.train(trainLabeled, 100000,
                                             0.00000000001)

#evaluateModel(linear_model_val, val_dataset)
#evaluateModel(linear_model, testLabeled)

#evaluate data
mae_val, rmse_val = evaluateModel(linear_model_val, val_dataset)
mae, rmse = evaluateModel(linear_model, testLabeled)

print "Validation: \n" + "MAE: " + str(mae_val) + "\nRMSE: " + str(rmse_val)
print "\nTest: \n" + "MAE: " + str(mae) + "\nRMSE: " + str(rmse)
# In[7]:

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree
help(LinearRegressionWithSGD.train)

# In[8]:

help(DecisionTree.trainRegressor)

# ## Train a Regression Model on the Bike Sharing Dataset

# In[9]:

linear_model = LinearRegressionWithSGD.train(data,
                                             iterations=10,
                                             step=0.1,
                                             intercept=False)
true_vs_predicted = data.map(lambda p:
                             (p.label, linear_model.predict(p.features)))
print "Linear Model predictions: " + str(true_vs_predicted.take(5))

# In[10]:

# we pass in an empty mapping for categorical features {}
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())
Example #42
pydf = DataFrame({'x':x,'y':y})
p = ggplot(pydf, aes('x','y')) + \
    geom_point(color='blue') 
display(p)

# COMMAND ----------

# MAGIC %md ## Linear Regression with SGD
# MAGIC * Load and parse the data where y = Median Housing Price (values[1]) and x = Population (values[0])
# MAGIC * Building two example models
# MAGIC * Reference pyspark MLLib regression
# MAGIC * * http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.regression

# COMMAND ----------

modelA = LinearRegressionWithSGD.train(parseddata, iterations=100, step=0.01, intercept=True)
modelB = LinearRegressionWithSGD.train(parseddata, iterations=1500, step=0.1, intercept=True)

# COMMAND ----------

print ">>>> ModelA intercept: %r, weights: %r" % (modelA.intercept, modelA.weights)

# COMMAND ----------

print ">>>> ModelB intercept: %r, weights: %r" % (modelB.intercept, modelB.weights)

# COMMAND ----------

# MAGIC %md ## Evaluate the Model
# MAGIC #### Predicted vs. Actual
Example #43
    # get the data from each stock csv file
    stocks = sc.textFile("hdfs:///shared/financial_data/stocks/permno_csv/" +
                         selected_file)
    stocks = stocks.mapPartitions(lambda x: csv.reader(x))
    # map and filter the data to (stock, time)
    labeled_data = stocks.map(map_to_point)
    labeled_data = labeled_data.filter(lambda x: x)
    labeled_data = labeled_data.map(lambda x: LabeledPoint(x[0], x[1])).cache()
    training, test = labeled_data.randomSplit([0.7, 0.3])
    # verify that the data exists
    if training.isEmpty():
        metrics.append([])
        continue
    # train the model
    model = LinearRegressionWithSGD.train(training,
                                          iterations=1000,
                                          step=0.00000001,
                                          intercept=True)
    test_features = test.map(lambda x: x.features)
    predictions = model.predict(test_features)
    test_preds = test.map(lambda x: x.label).zip(predictions)

    # grab percent error
    total_percent = test_preds.map(map_percent_error)
    total_percent = total_percent.filter(lambda x: x)
    # check to make sure not empty rdd
    if total_percent.isEmpty():
        metrics.append([])
        continue
    average_percent = total_percent.reduce(lambda x, y:
                                           (x[0] + y[0], x[1] + y[1]))
    average_percent = average_percent[0] / average_percent[1]
Example #44
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD as lrSGD

spark = SparkSession \
    .builder \
    .appName("Python Spark regression example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

regressionDataFrame = spark.read.csv('Advertising.csv',
                                     header=True,
                                     inferSchema=True)

regressionDataFrame = regressionDataFrame.drop('_c0')

regressionDataFrame.show(10)

regressionDataRDD = regressionDataFrame.rdd.map(list)

regressionDataLabelPoint = regressionDataRDD.map(
    lambda data: LabeledPoint(data[3], data[0:3]))

regressionLabelPointSplit = regressionDataLabelPoint.randomSplit([0.7, 0.3])

regressionLabelPointTrainData = regressionLabelPointSplit[0]

regressionLabelPointTestData = regressionLabelPointSplit[1]

ourModelWithLinearRegression = lrSGD.train(data=regressionLabelPointTrainData,
                                           iterations=200,
                                           step=0.02,
                                           intercept=True)
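The example stops after training. A hedged follow-up evaluating the held-out 30% split with RegressionMetrics (this evaluation step is an illustration, not part of the original example):

# Hypothetical evaluation on regressionLabelPointTestData
from pyspark.mllib.evaluation import RegressionMetrics

preds_and_labels = regressionLabelPointTestData.map(
    lambda p: (float(ourModelWithLinearRegression.predict(p.features)), p.label))
metrics = RegressionMetrics(preds_and_labels)
print('test RMSE = %s, R2 = %s' % (metrics.rootMeanSquaredError, metrics.r2))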
Example #45
print(parsedData.take(3))


# In[58]:

#Divide rawData into training, validation, and test sets
weights = [.8, .1, .1]
seed = 50
parsedTrainData, parsedValData, parsedTestData = parsedData.randomSplit(weights, seed)


# In[64]:

# Fit the model with default values
fitModel = LinearRegressionWithSGD.train(parsedTrainData)
print(fitModel)


# In[65]:

# Prediction 
testPoint = parsedTrainData.take(1)[0]

print(testPoint.label)

testPrediction = fitModel.predict(testPoint.features)

print(testPrediction)

def run_prdct(pr_values, sc):
    values = list(pr_values)

    # Configure
    train_path = '/Users/xiaoru_zhu/PycharmProjects/HousingPriceDA/Dataset/train.csv'

    # Initialize RDD
    rdd_lines = sc.textFile(train_path)
    head = rdd_lines.first()
    rdd_lines = rdd_lines.filter(lambda ln: ln != head) \
                         .mapPartitions(lambda x: csv.reader(x)) \
                         .persist(StorageLevel(True, True, False, False, 1))  # MEMORY_AND_DISK
    # data_num = rdd_lines.count()

    # Prepare for normalization
    sub = []
    minimum = []
    for index in range(5, 8):
        max_ = float(rdd_lines.map(lambda attr: attr[index]).max(key=float))
        min_ = float(rdd_lines.map(lambda attr: attr[index]).min(key=float))
        subtract = max_ - min_
        minimum.append(min_)
        sub.append(subtract)

    # Normalization: (val - min)/(max - min) scales each numeric feature into [0, 1] and narrows the error
    def normalization(line):
        line[5] = (float(line[5]) - minimum[0]) / sub[0]
        line[6] = (float(line[6]) - minimum[1]) / sub[1]
        line[7] = (float(line[7]) - minimum[2]) / sub[2]
        return line

    rdd_lines = rdd_lines.map(lambda attr: normalization(attr))
    values = normalization(values)

    # print(rdd_lines.first())  # test after normalization


    # extract features from every category column and generate dict
    def be_mapped(rdd_arg, column):
        return rdd_arg.map(lambda attr: attr[column]) \
                      .distinct() \
                      .zipWithIndex() \
                      .collectAsMap()  # result : {'BATH BEACH': 0, 'BAY RIDGE': 1, 'BEDFORD STUYVESANT': 2, ...}

    mappings = [be_mapped(rdd_lines, i) for i in [0, 1, 2, 8]]  # collect dicts into a list
    print('category feature mapping dict:', mappings)
    cat_len = sum(map(len, mappings))  # total number of categorical feature values
    num_len = len(rdd_lines.first()[5:8])  # number of numeric features (indexes 5, 6, 7)
    total_len = num_len + cat_len  # total feature count
    ''' >>> TEST
    print('categorical feature count: %d' % cat_len)
    print('numeric feature count: %d' % num_len)
    print('total feature count: %d' % total_len)
    '''

    # Create eigenvectors(feature vectors) for linear regression
    def extract_features(line):
        cat_vec = np.zeros(cat_len)  # new array for category features, init 0 for all elements
        step = 0
        for i, raw_feature in enumerate([line[0], line[1], line[2], line[8]]):  # [(0,line[0]), (1,line[1], ...) ]
            dict_cate = mappings[i]  # category feature mapping dict {'BATH BEACH': 0, 'BAY RIDGE': 1, 'xxx': 2, ...}
            idx = dict_cate[raw_feature]  # get value from dict
            cat_vec[idx + step] = 1  # set 1 for index in array
            step = step + len(dict_cate)  # jump to the next attribute area
        num_vec = np.array([float(raw_feature) for raw_feature in line[5:8]])
        return np.concatenate((cat_vec, num_vec))  # splice category and number vectors

    def extract_label(line):
        return float(line[-1])


    # Error analysis
    def squared_error(actual, prdct):  # mean squared error
        return (prdct - actual) ** 2

    def abs_error(actual, prdct):  # mean absolute error
        return np.abs(prdct - actual)

    def squared_log_error(prdct, actual):  # root mean squared log error
        return (np.log(prdct + 1) - np.log(actual + 1)) ** 2

    # Parameter-tuning helper; there is no separate test set, so the training data is reused for evaluation
    def evaluate(train_set, iterations, step, reg_param, reg_type, intercept):
        # create a linear model using stochastic gradient descent (SGD)
        model = LinearRegressionWithSGD.train(train_set, iterations, step, regParam=reg_param, regType=reg_type,
                                              intercept=intercept)
        # use test data -> rdd: [(actual_value, predicted_value), (...), (...), ......]
        tlabel_tprediction = train_set.map(lambda point: (point.label, model.predict(point.features)))
        # calculate Root Mean Squared Log Error
        rmsle = np.sqrt(tlabel_tprediction.map(lambda tp: squared_log_error(tp[0], tp[1])).mean())
        return rmsle

    # Generate the final feature vectors by 'map' and 'extract' function
    data = rdd_lines.map(lambda line: LabeledPoint(extract_label(line), extract_features(line)))
    #first_point = data.first()

    values_vec = extract_features(values)

    # create linear model and test
    linear_model = LinearRegressionWithSGD.train(data, iterations=200, step=0.05, intercept=False)
    true_vs_predicted = data.map(lambda point: (point.label, linear_model.predict(point.features)))
    print('The first five prediction values: ' + str(true_vs_predicted.take(5)))  # test

    rst = linear_model.predict(values_vec)

    # error analysis
    m_s_e = true_vs_predicted.map(lambda tp: squared_error(tp[0], tp[1])).mean()
    m_a_e = true_vs_predicted.map(lambda tp: abs_error(tp[0], tp[1])).mean()
    r_m_s_l_e = np.sqrt(true_vs_predicted.map(lambda tp: squared_log_error(tp[0], tp[1])).mean())
    # print('Linear Model - Mean Squared Error: %2.4f' % m_s_e)
    print('Linear Model - Mean Absolute Error: %2.4f' % m_a_e)
    print('Linear Model - Root Mean Squared Log Error: %2.4f' % r_m_s_l_e)

    '''
    # adjust 'iterations' argument
    args_it = [1, 5, 10, 20, 50, 100, 200]
    error_it = [evaluate(data, arg, 0.01, 0.0, 'l2', False) for arg in args_it]
    for i in range(len(args_it)):
        print('r_m_s_l_e: %f when iterations = %d' % (error_it[i], args_it[i]))

    # adjust 'step' argument
    args_stp = [0.01, 0.025, 0.05, 0.1, 0.3, 0.5, 1.0]
    error_stp = [evaluate(data, 10, arg, 0.0, 'l2', False) for arg in args_stp]

    for i in range(len(args_stp)):
        print('r_m_s_l_e: %f when step = %f' % (error_stp[i], args_stp[i]))
    '''
    rst = round(rst, 2)
    r_m_s_l_e = round(r_m_s_l_e, 2)
    m_a_e = round(m_a_e, 2)
    rst_lst = [rst, r_m_s_l_e, m_a_e]
    print(rst_lst)
    return rst_lst
from pyspark import SparkContext
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

sc = SparkContext()  # assumes no SparkContext is active yet

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:])
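
# Quick check of parsePoint on one hypothetical line in lpsa.data's layout
# (a label, a comma, then space-separated features):
example = parsePoint("-0.43,-1.64 -2.01")
assert example.label == -0.43 and list(example.features) == [-1.64, -2.01]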

data = sc.textFile("/home/master/ejemplos-python/lpsa.data")
parsedData = data.map(parsePoint)


# Split the data into two sets, training and test
# The seed is fixed here so the result can be reproduced
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3], seed=100)
# Build the model
model = LinearRegressionWithSGD.train(trainingData)
# Evaluate the trained model
# --- Point 1 ---
Preds = testData.map(lambda p: (float(model.predict(p.features)), p.label))
MSE = Preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / Preds.count()
print("Mean Squared Error = " + str(MSE))
print("\n")
# --- Point 2 ---
# More about the model: evaluate the regression with RegressionMetrics
# Instantiate the metrics object
metrics = RegressionMetrics(Preds)
# Squared Error
print("MSE = %s" % metrics.meanSquaredError)
print("RMSE = %s" % metrics.rootMeanSquaredError)
# R-squared
print("R-squared = %s" % metrics.r2)
Beispiel #48
0
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD


def textParser():
    datas = []
    lines = open('abalone.txt').readlines()
    for line in lines:
        tmp = line.strip().split('\t')
        # column 0 (sex) is skipped; the last column is the label
        datas.append(LabeledPoint(float(tmp[-1]), [float(x) for x in tmp[1:-1]]))
    return datas


if __name__ == '__main__':
    sc = SparkContext()
    datas = sc.parallelize(textParser())
    print datas.collect()[0]
    model = LinearRegressionWithSGD.train(datas,
                                          step=2,
                                          iterations=100,
                                          intercept=True,
                                          regType='l2')
    print '**' * 50
    print model.weights
    print model.intercept
    print '**' * 50
    # compute the MSE between the model's predictions and the training labels
    prevals = datas.map(lambda p: (p.label, model.predict(p.features)))
    MSE = prevals.map(lambda vp:
                      (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / prevals.count()
    print 'MSE:', str(MSE)
    print 'Test data point:', datas.collect()[0]
    print 'Model prediction:', model.predict(array(datas.collect()[0].features))
    sc.stop()
#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
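# StandardScaler(True, True) sets withMean=True and withStd=True: each feature
# is centred on its mean and scaled to unit standard deviation.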
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
trainScaled = trainLabel.zip(
    scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(
    scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
validPredicts = validScaled.map(lambda x:
                                (float(model.predict(x.features)), x.label))
validPredicts.collect()
import math
RMSE = math.sqrt(validPredicts.map(lambda p: pow(p[0] - p[1], 2)).mean())

#Section 7.5.2
from pyspark.mllib.evaluation import RegressionMetrics
validMetrics = RegressionMetrics(validPredicts)
validMetrics.rootMeanSquaredError
Statistics.corr(rdd1, rdd2, method) computes the correlation between the two RDDs; method as above
Statistics.chiSqTest(rdd) runs Pearson's independence test between each feature and the label on an
                         RDD of LabeledPoint objects, returning a ChiSqTestResult object containing the
                         p-value, test statistic, and degrees of freedom for each feature. Features and
                         labels must be categorical, i.e. take discrete values.
"""
# 11.5.3 Classification and regression
"""
Classification and regression both use MLlib's LabeledPoint class (in the mllib.regression package).
A LabeledPoint consists of a label (the label is always a Double, though it can
be set to a discrete integer for classification algorithms) and a features vector.
"""
# Linear regression
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD

points = sc.parallelize([LabeledPoint(1.0, [1, 2, 3])])  # create an RDD of LabeledPoints
model = LinearRegressionWithSGD.train(points, iterations=200, intercept=True)
print model.weights, model.intercept
# Logistic regression
# Support vector machines
# Naive Bayes
# Decision trees and random forests
# 11.5.4 Clustering
# KMeans

# 11.5.5 Collaborative filtering and recommendation
# 11.5.6 Dimensionality reduction
# 1. Principal component analysis
# 2. Singular value decomposition
# 11.5.7 Model evaluation
# 11.6
from pyspark.mllib.clustering import KMeans
Beispiel #51
0
from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.sql import SQLContext


def taxi_regression(sc, filename):
    '''
    Args:
        sc: The Spark Context
        filename: Filename of the taxi-trip CSV file to use, where each line represents one trip
    '''

    sqlContext = SQLContext(sc)
    df = sqlContext.read.load(filename,
                              format='com.databricks.spark.csv',
                              header='true',
                              inferSchema='true').sample(False, 0.001)

    df = df.filter((df.pickup_longitude < -73.75)
                   & (df.pickup_longitude > -74.05)
                   & (df.dropoff_longitude < -73.75)
                   & (df.dropoff_longitude > -74.05))
    df = df.filter((df.pickup_latitude < 40.9) & (df.pickup_latitude > 40.6)
                   & (df.dropoff_latitude < 40.9)
                   & (df.dropoff_latitude > 40.6))

    discretizer1 = QuantileDiscretizer(numBuckets=100,
                                       inputCol="pickup_latitude",
                                       outputCol="pickup_latitude_bucket")
    discretizer2 = QuantileDiscretizer(numBuckets=100,
                                       inputCol="pickup_longitude",
                                       outputCol="pickup_longitude_bucket")
    discretizer3 = QuantileDiscretizer(numBuckets=100,
                                       inputCol="dropoff_latitude",
                                       outputCol="dropoff_latitude_bucket")
    discretizer4 = QuantileDiscretizer(numBuckets=100,
                                       inputCol="dropoff_longitude",
                                       outputCol="dropoff_longitude_bucket")
    result = discretizer1.fit(df).transform(df)
    result = discretizer2.fit(result).transform(result)
    result = discretizer3.fit(result).transform(result)
    result = discretizer4.fit(result).transform(result)

    vecAssembler3 = VectorAssembler(inputCols=[
        "pickup_latitude_bucket", "pickup_longitude_bucket",
        "dropoff_latitude_bucket", "dropoff_longitude_bucket"
    ],
                                    outputCol="features")
    transformed = vecAssembler3.transform(result)
    # cluster_df = transformed.select("pickup_latitude","pickup_longitude","predction_pickup")
    # cluster_df.write.format("com.databricks.spark.csv").option("header", "true").save("file.csv")
    transformed = transformed.select("features", "fare_amount")
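
    # get_labeled_point is not defined in this snippet; a minimal, hypothetical
    # sketch consistent with its use below (fare_amount as the label, and the
    # ml.linalg vector from VectorAssembler converted to an mllib vector):
    from pyspark.mllib.linalg import Vectors
    from pyspark.mllib.regression import LabeledPoint

    def get_labeled_point(row):
        return LabeledPoint(float(row.fare_amount), Vectors.fromML(row.features))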

    labeled_rdd = transformed.rdd.map(lambda x: get_labeled_point(x))

    # debug: print a few labeled points (collecting the whole RDD just to print it would be expensive)
    for row in labeled_rdd.take(5):
        print(row)
    training_data, test_data = labeled_rdd.randomSplit([0.8, 0.2])
    model = LinearRegressionWithSGD.train(training_data,
                                          iterations=100,
                                          step=0.2)

    valuesAndPredsTraining = training_data.map(
        lambda p: (float(model.predict(p.features)), p.label))
    valuesAndPreds = test_data.map(lambda p:
                                   (float(model.predict(p.features)), p.label))

    trainingMetrics = RegressionMetrics(valuesAndPredsTraining)
    metrics = RegressionMetrics(valuesAndPreds)

    print("RMSE = ", metrics.rootMeanSquaredError, " Explained Variance = ",
          metrics.explainedVariance, " RMSE Training = ",
          trainingMetrics.rootMeanSquaredError)
Beispiel #52
0
import csv
from StringIO import StringIO

from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD


# Load and parse the data
def parsePoint(line):
    values = csv.reader(StringIO(line),
                        delimiter=";").next()  # CSV parsing of line
    values = [float(x) for x in values]  # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])  # y = quality, X = row[:-1]


if __name__ == '__main__':
    conf = SparkConf().setMaster("local[*]").setAppName("Wine Regression")
    sc = SparkContext(conf=conf)

    wines = sc.textFile("winequality-red.csv")
    parsedData = wines.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p:
                                    (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
        lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
housingTrain = sets[0]
housingValid = sets[1]

#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
validPredicts = validScaled.map(lambda x: (float(model.predict(x.features)), x.label))
validPredicts.collect()
import math
RMSE = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())

#Section 7.5.2
from pyspark.mllib.evaluation import RegressionMetrics
validMetrics = RegressionMetrics(validPredicts)
validMetrics.rootMeanSquaredError
validMetrics.meanSquaredError
"""

import sys
from pyspark.mllib.regression import LinearRegressionWithSGD

from spark_application import create_spark_application
from data_loader import DataLoader
from reader import read_districts_file

# Get file paths from arguments
if len(sys.argv) != 4:
    print("Usage: linear_regression.py FEATURES_FILE MODEL_FOLDER DISTRICTS_FILE")
    sys.exit()
features_file, model_folder, districts_file = sys.argv[1:]

spark_context, sql_context = create_spark_application(
    "train_linear_regression")
data_loader = DataLoader(spark_context, sql_context, features_file)
data_loader.initialize()

# train and store a model for each district in the districts file
for lat, lon in read_districts_file(districts_file):
    print("Training District: %f, %f" % (lat, lon))
    model = LinearRegressionWithSGD.train(data_loader.get_train_data(
        (lat, lon)),
                                          iterations=1000,
                                          step=1e-1)
    # save the model in the specified model_folder
    model.save(spark_context,
               '%s/model_%s_%s' % (model_folder, str(lat), str(lon)))
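
# Sketch: a stored model can later be reloaded with the matching load call
# (shown here for the last district only; same path scheme as above).
from pyspark.mllib.regression import LinearRegressionModel
reloaded = LinearRegressionModel.load(
    spark_context, '%s/model_%s_%s' % (model_folder, str(lat), str(lon)))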
Beispiel #55
0
    def printMetrics(model):
        predictions_and_labels = test.map(lambda lr: (float(model.predict(lr.features)), lr.label))
        metrics = RegressionMetrics(predictions_and_labels)
        f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
        f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
        f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
        f.write('Root Mean Squared Error:{0}\n'.format(metrics.rootMeanSquaredError))
        f.write('R^2 :{0}\n'.format(metrics.r2))
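
    # paramGrid and numModels are not shown in this fragment; a hypothetical
    # construction consistent with how they are indexed below:
    import itertools
    paramGrid = [{'regParam': rp, 'iterations': it, 'regType': rt}
                 for rp, it, rt in itertools.product([0.0, 0.01, 0.1], [10, 100], ['l1', 'l2'])]
    numModels = len(paramGrid)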

    for j in range(numModels):
        regp = paramGrid[j]['regParam']
        iters = paramGrid[j]['iterations']
        regt = paramGrid[j]['regType']
   
        timestart = datetime.datetime.now()

        f.write('Model{0}: regParam = {1}, iterations = {2}, regType = {3}\n'.format(str(j), regp, iters, regt))
        # Train a linear regression model with this hyperparameter set
        model = LinearRegressionWithSGD.train(training, iterations=iters, \
            step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=regp, \
            regType=regt, intercept=False, validateData=True)
        printMetrics(model)

        timeend = datetime.datetime.now()
        timedelta = round((timeend-timestart).total_seconds(), 2) 
        f.write("Time taken to execute this model is: " + str(timedelta) + " seconds.\n")

    f.close()
    sc.stop()
# In[77]:

from pyspark.mllib.regression import LinearRegressionWithSGD
# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept

# In[79]:

# TODO: Replace <FILL IN> with appropriate code
firstModel = LinearRegressionWithSGD.train(parsedTrainData, numIters, alpha,
                                           miniBatchFrac, None, reg, regType,
                                           useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
print weightsLR1, interceptLR1

# In[80]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
expectedWeights = [
    16.682292427, 14.7439059559, -0.0935105608897, 6.22080088829,
    4.01454261926, -3.30214858535, 11.0403027232, 2.67190962854, 7.18925791279,
    4.46093254586, 8.14950409475, 2.75135810882
]
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept


# In[62]:

# TODO: Replace <FILL IN> with appropriate code
firstModel = LinearRegressionWithSGD.train(parsedTrainData, 
                                           iterations=numIters, 
                                           step=alpha, 
                                           miniBatchFraction=miniBatchFrac, 
                                           initialWeights=None, 
                                           regParam=reg, 
                                           regType=regType, 
                                           intercept=useIntercept 
                                           )

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
print weightsLR1, interceptLR1


# In[63]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
Beispiel #58
0
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=100,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd,
                                          initialWeights=array([1.0, 1.0]))
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        except ValueError:
            self.fail()
from datetime import datetime

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel

sc = SparkContext()


def parsePoint(line):
    # assumes an all-numeric CSV row: column 7 is the label, columns 0-10 are the features
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[7], values[0:11])

#data_file = sc.textFile("/home/faiz89/Desktop/Eastman/2008.csv")
data_file = sc.textFile("../2008_small.csv")
header = data_file.first()
raw_data = data_file.filter(lambda x: x != header)

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache()
model = LinearRegressionWithSGD.train(trainingData, iterations=1)
print 'Training Time consumed =', (datetime.now() - startTime)
startTestTime = datetime.now()
testData.cache()
# Evaluate the model on the test data
valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds \
    .map(lambda vp: (vp[0] - vp[1])**2) \
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
print 'Testing Time consumed =', (datetime.now() - startTestTime)
print 'Total Time:', (datetime.now() - startTime)

print("Mean Squared Error = " + str(MSE))
# Save and load model
model.save(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
sameModel = LinearRegressionModel.load(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
Beispiel #60
0
    # Load and parse the data
    def parsePoint(line):
        values = line.split()
        return LabeledPoint(
            int(values[0]),
            DenseVector([int(x.split(':')[1]) for x in values[1:]]))
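
    # Expected input layout (libsvm-style; hypothetical values):
    #   "1 1:12 2:7 3:3"  ->  label 1.0, features [12, 7, 3]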

    data = sc.textFile(
        "/Users/hugomathien/Documents/workspace/footballdata/learning_vector/learningVector8.txt"
    )
    parsedData = data.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData,
                                          iterations=1000000,
                                          step=0.0000000000001)

    # Get predictions
    valuesAndPreds = parsedData.map(
        lambda p: (float(model.predict(p.features)), p.label))

    # Instantiate metrics object
    metrics = RegressionMetrics(valuesAndPreds)

    # Squared Error
    print("MSE = %s" % metrics.meanSquaredError)
    print("RMSE = %s" % metrics.rootMeanSquaredError)

    # R-squared
    print("R-squared = %s" % metrics.r2)