def test_regression(self):
    """Check that every MLlib regressor learns the sign of the second feature.

    Six regressors (linear, lasso and ridge SGD, decision tree, random
    forest, gradient-boosted trees) are trained on four labelled points.
    Each must predict a non-positive value for the points labelled -1.0 and
    a positive value for the points labelled 1.0.  The test finally checks
    that the SGD trainers accept ``initialWeights`` without raising
    ``ValueError``.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Even positions hold the -1.0 examples, odd positions the 1.0 ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    check_signs(LinearRegressionWithSGD.train(rdd, iterations=10))
    check_signs(LassoWithSGD.train(rdd, iterations=10))
    check_signs(RidgeRegressionWithSGD.train(rdd, iterations=10))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories

    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4))

    # Supplying explicit initial weights must be accepted by every SGD trainer.
    start_weights = array([1.0, 1.0])
    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=start_weights, iterations=10)
        LassoWithSGD.train(rdd, initialWeights=start_weights, iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=start_weights, iterations=10)
    except ValueError:
        self.fail()
def main():
    """Pick an SGD step size by hold-out validation and report the test RMSE.

    Reads pickled train and test data from ``input``, then for every
    combination of step size and train/validation split trains a
    ``LinearRegressionWithSGD`` model and measures RMSE on the validation
    part.  The step size with the lowest validation RMSE is used to retrain
    on the full training set, and the resulting test RMSE is printed.

    A combination whose training or evaluation raises is reported and
    skipped.  Previously the exception was swallowed and the comparison
    reused the RMSE of an earlier combination (or hit an undefined name on
    the first iteration).
    """
    def parse(rdd):
        # NOTE(review): `len(line.label)` is only evaluated when the features
        # are empty; confirm that labels support len() or that `and` was meant.
        return rdd.map(parseInput).filter(
            lambda line: len(line.features) != 0 or len(line.label) != 0)

    def rmse(model, points):
        values_and_preds = points.map(
            lambda p: (p.label, model.predict(p.features)))
        squared_error = values_and_preds.map(
            lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda a, b: a + b)
        return math.sqrt(squared_error / values_and_preds.count())

    # reading test and train data
    trainData = sc.pickleFile(input + '/Train_data_unnormalized.pickle/part-00000')
    testData = sc.pickleFile(input + '/Test_data_unnormalized.pickle/part-00000')
    parsedData = parse(trainData)
    parsedTestData = parse(testData)

    numIterations = 100
    stepSize = [0.1, 10, 20]
    BestError = 1000000
    BestStep = 0
    BestSplit = []
    splits = [[1, 2], [1, 3]]

    # Cross Validation
    for step in stepSize:
        for split in splits:
            # Fixed seed 10 so every step size sees the same partitions.
            (Train_RDD, Valid_RDD) = trainData.randomSplit(split, 10)
            parsed_input = parse(Train_RDD)
            parsed_valid = parse(Valid_RDD)
            try:
                model = LinearRegressionWithSGD.train(
                    parsed_input, iterations=numIterations, step=step)
                RMSE = rmse(model, parsed_valid)
            except Exception as exc:
                # Best effort: a failing combination must not abort the sweep,
                # but it must not be scored with a stale RMSE either.
                print("Skipping step=%s split=%s: %s" % (step, split, exc))
                continue
            if RMSE < BestError:
                BestError = RMSE
                BestStep = step
                BestSplit = split

    # Finding test error
    model = LinearRegressionWithSGD.train(parsedData, iterations=numIterations,
                                          step=BestStep)
    RMSE_test = rmse(model, parsedTestData)

    print("Best Root Mean Squared Error Validation = " + str(BestError))
    print("Best Root Mean Squared Error Test= " + str(RMSE_test))
    print("Best StepSize = " + str(BestStep))
    print(BestSplit)
def regularized(trainingData, testData, trainingSize, testSize, regTypeVal):
    """Grid-search a regularized least-squares model and print its RMSE.

    Every combination of iteration count, step size and regularization
    strength is trained with ``LinearRegressionWithSGD`` using the
    regularizer named by ``regTypeVal``.  The combination with the lowest
    training RMSE is retrained, and its training and test RMSE are printed.

    Args:
        trainingData: RDD of labelled points used for training and selection.
        testData: RDD of labelled points used for the final evaluation.
        trainingSize: number of points in ``trainingData`` (RMSE divisor).
        testSize: number of points in ``testData`` (RMSE divisor).
        regTypeVal: value forwarded as ``regType`` to the trainer.
    """
    def rmse(model, points, size):
        values_and_preds = points.map(
            lambda p: (p.label, model.predict(p.features)))
        squared_error = values_and_preds.map(
            lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda a, b: a + b)
        return math.sqrt(squared_error / size)

    numIterValList = [3000, 5000, 10000]
    stepSizeValList = [1e-11, 1e-9, 1e-7]
    regParamValList = [0.01, 0.1, 1, 10]

    # variables for the best parameters
    bestNumIterVal = 200
    bestStepSizeVal = 1
    bestTrainingRMSE = 1e10
    bestRegParamVal = 0.0

    for numIterVal, stepSizeVal, regParamVal in itertools.product(
            numIterValList, stepSizeValList, regParamValList):
        model = LinearRegressionWithSGD.train(trainingData,
                                              iterations=numIterVal,
                                              step=stepSizeVal,
                                              regParam=regParamVal,
                                              regType=regTypeVal)
        trainingRMSE = rmse(model, trainingData, trainingSize)
        # A NaN RMSE compares False here, so diverged runs are never selected;
        # a perfect 0.0 is now eligible (the old truthiness check skipped it).
        if trainingRMSE < bestTrainingRMSE:
            bestNumIterVal = numIterVal
            bestStepSizeVal = stepSizeVal
            bestRegParamVal = regParamVal  # previously never recorded
            bestTrainingRMSE = trainingRMSE
        print("%s %s %s %s" % (numIterVal, stepSizeVal, regParamVal, trainingRMSE))

    print("%s %s %s %s" % (bestNumIterVal, bestStepSizeVal, bestRegParamVal,
                           bestTrainingRMSE))

    # Retrain with the winning combination; the original passed the last
    # regParam of the grid instead of the best one.
    model = LinearRegressionWithSGD.train(trainingData,
                                          iterations=bestNumIterVal,
                                          step=bestStepSizeVal,
                                          regParam=bestRegParamVal,
                                          regType=regTypeVal)

    # Evaluating the model on training data
    print(rmse(model, trainingData, trainingSize))

    # Evaluating the model on test data
    print(rmse(model, testData, testSize))
def hadamard_fit(data):
    """Fit linear regression on a Hadamard-transformed sample and print MSE.

    Samples 1024 comma-separated numeric rows (with replacement) from
    ``data``, multiplies the sample by a 1024 x 1024 Hadamard matrix built
    by block doubling, and turns each transformed row into a
    ``LabeledPoint`` whose label is the last column.  The points are split
    80/20; the training part is filtered by ``mat_B_filter`` and used to
    train ``LinearRegressionWithSGD``, and the MSE on the held-out part is
    printed.

    The sample is now materialised once.  Previously it was parallelized
    and then collected twice.
    """
    order = 10
    size = 2 ** order  # 1024 rows; matrix dimension and sample size must agree

    # sample `size` rows from data
    parsedData = data.map(
        lambda line: np.array([float(x) for x in line.split(',')]))
    rows = parsedData.takeSample(True, size)

    # create Hadamard matrix by repeatedly doubling the block [[H, H], [H, -H]]
    H = np.zeros([size, size])
    H[0, 0] = 1
    h = 1
    for _ in range(order):
        H[0:h, h:2 * h] = H[0:h, 0:h]
        H[h:2 * h, 0:h] = H[0:h, 0:h]
        H[h:2 * h, h:2 * h] = -1 * H[0:h, 0:h]
        h = h * 2

    # multiply with Hadamard matrix
    lens = rows[0].shape[0]
    X_array = np.array(rows).reshape(size, lens)
    X_hadamard = H.dot(X_array)
    x_rdd = sc.parallelize(X_hadamard)  # each entry is a numpy array

    # split training and testing
    subset = x_rdd.map(lambda x: LabeledPoint(x[-1], x[0:lens - 1])) \
        .randomSplit([0.8, 0.2])
    x_rp = subset[0].filter(mat_B_filter)  # mat B actually serves as a filter

    model3 = LinearRegressionWithSGD.train(x_rp, iterations=100,
                                           step=0.00000001, regType=None)

    # Evaluate the model on the held-out split
    valuesAndPreds = subset[1].map(
        lambda p: (p.label, model3.predict(p.features)))
    MSE = valuesAndPreds \
        .map(lambda vp: (vp[0] - vp[1]) ** 2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
def regression():
    """Train and save a linear model on the joined results.

    Reads ``output + "/joinedResults"`` as parquet, builds
    ``(avg_prcp, yy, latitude, longitude)`` tuples, derives per-column
    bounds, converts rows with ``parsePointPrediction``, trains an
    L2-regularized ``LinearRegressionWithSGD`` model on an 80/20 split,
    prints the test MSE and saves the model under ``output + "/modelpath"``.
    """
    # Reads the data from the joinedResults directory as a parquet file
    datadf = sqlContext.read.parquet(output + "/joinedResults")
    datadf.show()
    data = datadf.rdd.map(lambda w: (float(w.avg_prcp), int(w.yy),
                                     float(w.latitude), float(w.longitude)))

    # Tuples compare element by element, so the first field of the extreme
    # tuple is the extreme avg_prcp value.
    max_prcp = data.max()
    min_prcp = data.min()

    lat = data.map(lambda x: x[2]).cache()
    min_lat = lat.min()
    max_lat = lat.max()

    longt = data.map(lambda x: x[3]).cache()
    min_long = longt.min()
    max_long = longt.max()

    max_ = [max_prcp[0], float(2050), max_lat, max_long]
    min_ = [min_prcp[0], float(1990), min_lat, min_long]

    # change the format to fit in LinearRegression library
    parsedData = data.map(lambda x: parsePointPrediction(x, max_, min_)).cache()

    # Split data approximately into training (80%) and test (20%)
    trainData, testData = parsedData.randomSplit([0.8, 0.2], seed=0)
    trainData.cache()
    testData.cache()

    # Build the model; parameters were found by trial and error.
    # `intercept` is a real boolean now; the former string "true" only
    # worked because any non-empty string is truthy.
    model = LinearRegressionWithSGD.train(trainData, iterations=500,
                                          regType="l2", regParam=10,
                                          intercept=True)

    # Evaluate the model on test data
    valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))

    model.save(sc, output + "/modelpath")
    return
def test_regression(self):
    """Check that the three SGD regressors learn the sign of the second feature.

    Linear, lasso and ridge regression are trained with default settings on
    four labelled points.  Each model must predict a non-positive value for
    the points labelled -1.0 and a positive value for those labelled 1.0.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Even positions hold the -1.0 examples, odd positions the 1.0 ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))
def main(sc):
    """Train a linear model on the pickled features and save it.

    Loads records from ``/tmp/features_saved``, builds one category mapping
    per feature column with ``get_mapping`` (the month mapping is replaced
    by a fixed 1..12 -> 0..11 table), converts every record to a
    ``LabeledPoint`` and trains ``LinearRegressionWithSGD`` without an
    intercept.  The model is saved to ``/tmp/linear_model``.

    Args:
        sc: the SparkContext used for loading the data and saving the model.
    """
    # Loading the features; fetch the first record once and reuse it.
    features_cr = sc.pickleFile('/tmp/features_saved')
    first_record = features_cr.first()
    print(first_record)

    # Getting the features ready for training
    numberFeatures = len(first_record) - 1
    mappings = [get_mapping(features_cr, i) for i in range(0, numberFeatures)]

    # Month: map calendar month 1..12 onto index 0..11
    mappings[1] = {month + 1: month for month in range(12)}
    # Year: ?

    cat_len = sum(map(len, mappings))
    data = features_cr.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    print(first_record)

    # Regression:
    linear_model = LinearRegressionWithSGD.train(data, iterations=100,
                                                 step=0.25, intercept=False)
    linear_model.save(sc, '/tmp/linear_model')
    # Call form of print works on Python 2 and 3 and matches the calls above.
    print('OK model')
def test_regression(self):
    """Check the three SGD regressors on features built by ``scipy_matrix``.

    Linear, lasso and ridge regression are trained with default settings on
    four labelled points whose feature vectors come from
    ``self.scipy_matrix``.  Each model must predict a non-positive value for
    the points labelled -1.0 and a positive value for those labelled 1.0.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD

    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def check_signs(model):
        # Even positions hold the -1.0 examples, odd positions the 1.0 ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))
def main():
    """Train and save one log-traffic linear model per hour of the day.

    Reads the 2015 traffic CSV from S3, drops the header, builds category
    mappings for columns 1..10 with ``get_mapping``, uploads the mappings to
    S3, then trains a ``LinearRegressionWithSGD`` model for each hour 0..23
    and saves it under ``ML_model/linear_model_log_<hour>``.

    The SparkContext is stopped even if a step fails.
    """
    spark = (SparkSession.builder.appName("TRAFFIC")
             .config("spark.executor.cores", "6")
             .config("spark.executor.memory", "6g")
             .getOrCreate())
    sc = spark.sparkContext
    try:
        # Unused in this function; kept in case helpers rely on it existing
        # -- TODO confirm and remove.
        sqlContext = SQLContext(sc)

        raw_data = sc.textFile("s3a://insighttraffic/dot_traffic_2015.txt")
        header = raw_data.first()
        records = raw_data.filter(lambda line: line != header).map(
            lambda x: x.split(","))
        records.cache()

        mappings = [get_mapping(records, i) for i in range(1, 11)]
        # sum() replaces reduce(), which is not a builtin on Python 3.
        category_len = sum(len(mapping) for mapping in mappings)
        boto3.resource('s3').Object(
            'insighttraffic', 'ML_model/mappings').put(Body=str(mappings))

        for hour in range(0, 24):
            # log transformed data; the model is trained inside the same
            # iteration, so the lambda sees the current `hour`.
            data_log = records.map(lambda r: LabeledPoint(
                extract_label(r, hour + 13),
                extract_features(r, hour, category_len, mappings)))
            linear_model_log = LinearRegressionWithSGD.train(
                data_log, iterations=100, step=0.01, intercept=True)
            linear_model_log.save(
                sc,
                "s3a://insighttraffic/ML_model/linear_model_log_" + str(hour))
    finally:
        sc.stop()
def get_model(self, dataf, num_iter, step_size, mini_batch_frction):
    """Train and return a ``LinearRegressionWithSGD`` model.

    Args:
        dataf: training data passed straight to the trainer.
        num_iter: forwarded as ``iterations``.
        step_size: forwarded as ``step``.
        mini_batch_frction: forwarded as ``miniBatchFraction``.
    """
    return LinearRegressionWithSGD.train(
        dataf,
        iterations=num_iter,
        step=step_size,
        miniBatchFraction=mini_batch_frction,
    )
def get_best_stepsize(step_sizes, training_lp, iterations, cv_trails):
    """Return the step size with the lowest cross-validated RMSE.

    ``training_lp`` is split into four equally weighted folds.  For each
    candidate step size a model is trained on the data outside each fold
    and scored on the fold.

    The per-fold error is now a true RMSE (squared errors divided by the
    fold size before the square root).  Previously the square root was
    taken of the raw sum.

    Args:
        step_sizes: iterable of candidate SGD step sizes.
        training_lp: RDD of labelled points.
        iterations: number of SGD iterations per model.
        cv_trails: divisor applied to the summed fold RMSE.
            NOTE(review): four folds are always evaluated; confirm callers
            pass 4 here.

    Returns:
        The best step size, or 0 when no candidate beat infinity.
    """
    best_stepsize = 0
    lowest_RMSE = float("inf")
    num_folds = 4
    cv_data = training_lp.randomSplit([1] * num_folds)  # 4 folds

    for step_size in step_sizes:
        total_RMSE = 0.0
        for cv_testing in cv_data:
            cv_training = training_lp.subtract(cv_testing)
            model = LinearRegressionWithSGD.train(cv_training,
                                                  iterations=iterations,
                                                  step=step_size)
            values_and_preds = cv_testing.map(
                lambda p: (p.label, model.predict(p.features)))
            squared_error = values_and_preds.map(
                lambda vp: (vp[0] - vp[1]) ** 2).reduce(operator.add)
            MSE = squared_error / float(values_and_preds.count())
            total_RMSE += math.sqrt(MSE)
        avg_RMSE = total_RMSE / cv_trails
        if avg_RMSE < lowest_RMSE:
            lowest_RMSE = avg_RMSE
            best_stepsize = step_size
    return best_stepsize
def train_model(data, rdd):
    """Train one model with scikit-learn and one with Spark MLlib.

    The scikit-learn model is fitted on ``data`` using column 0 as the
    target and the remaining columns as features.  The MLlib model is
    trained on ``rdd`` with an intercept.

    Returns:
        Tuple ``(sklearn_model, mllib_model)``.
    """
    targets = data[:, 0]
    inputs = data[:, 1:]

    sk_regressor = sklearnLR()
    sk_regressor.fit(inputs, targets)

    spark_regressor = LinearRegressionWithSGD.train(rdd, intercept=True)
    return sk_regressor, spark_regressor
def evaulate(train, test, iterations, step, regParam, regType, intercept):
    """Train a linear model and return its RMSLE on ``test``.

    ``regParam`` and ``regType`` are now forwarded to the trainer.
    Previously they were accepted but silently ignored, so every sweep over
    them trained the same unregularized model.

    Args:
        train: RDD of labelled points to fit on.
        test: RDD of labelled points to score on.
        iterations: number of SGD iterations.
        step: SGD step size; converted with ``float``.
        regParam: regularization strength.
        regType: regularizer type forwarded to the trainer.
        intercept: whether to fit an intercept.

    Returns:
        Square root of the mean ``squared_log_error`` over ``test``.
    """
    model = LinearRegressionWithSGD.train(train,
                                          iterations=iterations,
                                          step=float(step),
                                          regParam=regParam,
                                          regType=regType,
                                          intercept=intercept)
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    rmsle = np.sqrt(
        tp.map(lambda pair: squared_log_error(pair[0], pair[1])).mean())
    return rmsle
def get_best_result(best_step_size, training_lp, testing_lp, iterations):
    """Train an L2-regularized model and describe its test RMSE.

    The reported value is now a true root-mean-squared error: the summed
    squared error is divided by the number of test points before the square
    root.  Previously the square root was taken of the raw sum.

    Args:
        best_step_size: SGD step size to train with.
        training_lp: RDD of labelled points to fit on.
        testing_lp: RDD of labelled points to score on.
        iterations: number of SGD iterations.

    Returns:
        A human-readable string with the step size and the RMSE.
    """
    model = LinearRegressionWithSGD.train(training_lp,
                                          iterations=iterations,
                                          step=best_step_size,
                                          regType='l2')
    values_and_preds = testing_lp.map(
        lambda p: (p.label, model.predict(p.features)))
    squared_error = values_and_preds.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(operator.add)
    MSE = squared_error / float(values_and_preds.count())
    RMSE = math.sqrt(MSE)
    result_str = ('best step size got by cross validation cv: '
                  + str(best_step_size) + ', lowest RMSE: ' + str(RMSE))
    return result_str
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    """Train a linear model and return ``[rmse, mae, rmsle]`` on ``test``.

    Fixes over the previous version:
    * RMSLE is computed from this function's own predictions; it used to
      read an undefined name ``true_vs_predicted``.
    * MAE is the plain mean absolute error; it used to be square-rooted.

    Args:
        train: RDD of labelled points to fit on.
        test: RDD of labelled points to score on.
        iterations: number of SGD iterations.
        step: SGD step size.
        regParam: regularization strength.
        regType: regularizer type forwarded to the trainer.
        intercept: whether to fit an intercept.

    Returns:
        List ``[rmse, mae, rmsle]``.
    """
    model = LinearRegressionWithSGD.train(train, iterations, step,
                                          regParam=regParam, regType=regType,
                                          intercept=intercept)
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    # NOTE(review): `squarred_error` is spelled as the original calls it;
    # confirm the helper really has this name.
    rmse = np.sqrt(tp.map(lambda pair: squarred_error(pair[0], pair[1])).mean())
    mae = tp.map(lambda pair: abs_error(pair[0], pair[1])).mean()
    rmsle = np.sqrt(
        tp.map(lambda pair: squared_log_error(pair[0], pair[1])).mean())
    opt_metrics = [rmse, mae, rmsle]
    return opt_metrics
def Regression_Model(filename):
    """Train four regressors on closing prices and report their training error.

    Loads price series with ``get_csv_data``, builds one ``LabeledPoint``
    per date (index 0 is skipped) with the training close price as the only
    feature, and trains linear regression, a decision tree, a random forest
    and gradient-boosted trees.  Each model's root-mean-squared training
    error is printed, along with the short name of the best model.

    The best model is now chosen against an initial error of infinity.
    Previously the threshold was 1000, so ``es_model`` was unbound when
    every model's error was at least 1000.

    Returns:
        Tuple ``(Date, True_price, model, open_price, close_price,
        es_model)``.  ``model`` is always the linear regression model.
    """
    (open_price, close_price, open_price_train, close_price_train,
     True_price, True_price_train, Date) = get_csv_data(filename)

    output = [
        LabeledPoint(label=True_price_train[i], features=[close_price_train[i]])
        for i in range(1, len(Date))
    ]
    output_train_RDD = sc.parallelize(output).cache()

    lrm = LinearRegressionWithSGD.train(output_train_RDD, step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance',
                                       maxDepth=5, maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=5, maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)

    print("\n============MODEL Evaluation=============\n")
    candidates = [
        ('LinearRegression', 'lrm', lrm),
        ('DecisionTree', 'tree', tree),
        ('RandomForest', 'forest', forest),
        ('GradientBoostedTrees', 'gradient', gradient),
    ]

    # Swap in a different model here if needed.
    # NOTE(review): the returned model is always `lrm`, even when `es_model`
    # names another model as the best one -- confirm this is intended.
    output_model_RDD = lrm

    result = ''
    err = float("inf")
    es_model = None
    sample_count = float(output_train_RDD.count())  # same for every model
    for display_name, short_name, model in candidates:
        predictions = model.predict(output_train_RDD.map(lambda lp: lp.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(
            predictions)
        # The value is a root-mean-squared error; the printed label text is
        # left as it was.
        rmse = (labelsAndPredictions.map(
            lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum()
            / sample_count) ** 0.5
        result += display_name + "\tMean Squared Error\t=" + str(rmse) + "\n"
        if err > rmse:
            err = rmse
            es_model = short_name

    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
def getRMSE(step_array):
    """Return validation RMSE for each SGD step size.

    For every step in ``step_array`` a model is trained on the module-level
    ``train_featureScoreTimeRDD`` for 5000 iterations and scored on
    ``val_featureScoreTimeRDD``.

    The tuple-unpacking lambda was replaced by index access so the function
    also parses on Python 3; the results are unchanged.

    Returns:
        List of ``(step, rmse)`` tuples in input order.
    """
    valRMSE_list = []
    for step in step_array:
        model = LinearRegressionWithSGD.train(train_featureScoreTimeRDD,
                                              iterations=5000, step=step)
        labelsAndPreds = val_featureScoreTimeRDD.map(
            lambda p: (p.label, model.predict(p.features)))
        squared_error = labelsAndPreds.map(
            lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
        valMSE = squared_error / val_featureScoreTimeRDD.count()
        valRMSE = valMSE ** 0.5
        valRMSE_list.append((step, valRMSE))
    return valRMSE_list
def get_model_weight(self, dataf, weights, num_iter, step_size, mini_batch_frction):
    """Train and return a ``LinearRegressionWithSGD`` model from given weights.

    Args:
        dataf: training data passed straight to the trainer.
        weights: forwarded as ``initialWeights``.
        num_iter: forwarded as ``iterations``.
        step_size: forwarded as ``step``.
        mini_batch_frction: forwarded as ``miniBatchFraction``.
    """
    return LinearRegressionWithSGD.train(
        dataf,
        iterations=num_iter,
        step=step_size,
        miniBatchFraction=mini_batch_frction,
        initialWeights=weights,
    )
def evaluate(train_set, iterations, step, reg_param, reg_type, intercept):
    """Train a linear model with SGD and return its RMSLE on the training set.

    Note that the model is scored on ``train_set`` itself; this function
    receives no separate test set.

    Args:
        train_set: RDD of labelled points used for both fitting and scoring.
        iterations: number of SGD iterations.
        step: SGD step size.
        reg_param: regularization strength.
        reg_type: regularizer type forwarded to the trainer.
        intercept: whether to fit an intercept.

    Returns:
        Square root of the mean ``squared_log_error``.
    """
    # create linear model using stochastic gradient descent
    sgd_model = LinearRegressionWithSGD.train(
        train_set, iterations, step,
        regParam=reg_param, regType=reg_type, intercept=intercept)

    # rdd of (actual_value, predicted_value) pairs
    actual_and_predicted = train_set.map(
        lambda point: (point.label, sgd_model.predict(point.features)))

    # Root Mean Squared Log Error
    log_errors = actual_and_predicted.map(
        lambda tp: squared_log_error(tp[0], tp[1]))
    return np.sqrt(log_errors.mean())
def rmse_mae_gd(trainset, testset):
    """Train an L1-regularized mini-batch SGD model and print RMSE and MAE.

    The RMSE on ``testset`` is also appended to the module-level
    ``mean_RMSE`` list.
    """
    # Stochastic gradient descent with l1
    l1_model = LinearRegressionWithSGD.train(
        trainset,
        miniBatchFraction=0.00001,
        regParam=0.1,
        regType='l1',
        iterations=50,
        step=0.00000001)
    predicted = testset.map(lambda p: (p.label, l1_model.predict(p.features)))

    squared_sum = predicted.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
    RMSE_l1 = sqrt(squared_sum / predicted.count())

    absolute_sum = predicted.map(
        lambda vp: abs(vp[0] - vp[1])).reduce(lambda x, y: x + y)
    MAE_l1 = absolute_sum / predicted.count()

    mean_RMSE.append(RMSE_l1)
    print ("Root Mean Squared Error for Stochastic Gradient Descent with l1: " + str(RMSE_l1))
    print ("Mean Absolute Error for Stochastic Gradient Descent with l1: " + str(MAE_l1))
def linearRegression(features, sc, output_n):
    """Train on the first 70 points and predict the next 46.

    ``features`` is collected to the driver, sliced into a training part
    (indices 0..69) and a testing part (indices 70..115), and each part is
    redistributed with ``sc.parallelize``.  Previously the slices were
    computed but the code used the undefined names ``training_data`` and
    ``testing_data``.

    Args:
        features: RDD of labelled points (``.label`` and ``.features`` are
            read from each element).
        sc: SparkContext used to redistribute the slices.
        output_n: unused; kept for interface compatibility.

    Returns:
        Tuple ``(model, prediction)`` where ``prediction`` is an RDD of
        ``(label, predicted_value)`` pairs for the testing part.
    """
    features_and_label = features.collect()
    training_data = sc.parallelize(features_and_label[0:70])
    testing_data = sc.parallelize(features_and_label[70:116])

    # NOTE(review): iterations=0 asks for zero SGD iterations -- confirm
    # this is intended rather than a placeholder.
    linearregression_model = LinearRegressionWithSGD.train(
        training_data, iterations=0, regParam=200)
    prediction = testing_data.map(
        lambda line: (line.label, linearregression_model.predict(line.features)))
    return linearregression_model, prediction
def rr_fit(parsed_Data):
    """Fit L2-regularized linear regression and print the held-out MSE.

    ``parsed_Data`` is split 80/20 at random; the model is trained on the
    larger part and scored on the smaller one.
    """
    train_part, holdout_part = parsed_Data.randomSplit([0.8, 0.2])
    ridge_model = LinearRegressionWithSGD.train(
        train_part, iterations=100, step=0.00000001, regType="l2")

    # Evaluate the model on the held-out split
    valuesAndPreds = holdout_part.map(
        lambda p: (p.label, ridge_model.predict(p.features)))
    squared_sum = valuesAndPreds.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
    MSE = squared_sum / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
def train_regression(data):
    """Train linear regression on ``data``, print the training MSE, return the model.

    The tuple-unpacking lambda was replaced by index access so the function
    also parses on Python 3; the results are unchanged.

    Args:
        data: RDD of labelled points used for both fitting and scoring.

    Returns:
        The trained model.
    """
    model = LinearRegressionWithSGD.train(data, iterations=100, step=0.00000001)
    valuesAndPreds = data.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds \
        .map(lambda vp: (vp[0] - vp[1]) ** 2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    return model
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):
    """Train linear regression on a delimited text file and report the error.

    Args:
        dataPath: path of the text file to read.
        label: when 0, each row is reversed before use.
        normalize: when 1, the collected rows are passed through ``norm``
            and each row's element 0 is the label and element 1 the features;
            otherwise the last column is the label and the rest are features.
        character: column delimiter.
        master: Spark master URL for the SparkContext created here.
        ispca: when 1, features are reduced to two PCA components first.

    Returns:
        A string with the model weights and the mean absolute error over
        the training rows.

    Changes: rows are built with a list comprehension (``map`` returns an
    iterator on Python 3 and cannot be reversed by slicing), the row count
    uses ``count()`` instead of collecting every row, the result line is
    printed with the call form of ``print``, and the SparkContext is
    stopped even when a step fails.
    """
    pca_n = 2
    sc = SparkContext(master)
    try:
        data = sc.textFile(dataPath)
        ndata = data.map(lambda line: line.split(character)).map(
            lambda part: [float(x) for x in part])
        if label == 0:
            ndata = ndata.map(lambda line: line[::-1])

        if normalize == 1:
            test_data = norm(ndata.collect())
            norm_data = sc.parallelize(test_data)
            train_data = norm_data.map(lambda part: lbp(part[0], part[1]))
        else:
            test_data = ndata.map(
                lambda part: (part[len(part) - 1],
                              part[0:len(part) - 1])).collect()
            train_data = ndata.map(
                lambda part: lbp(part[len(part) - 1], part[0:len(part) - 1]))

        # NOTE(review): the original layout does not show whether this
        # branch belonged inside the `else` above; it is applied to both
        # paths here -- confirm.
        if ispca == 1:
            pca = PCA(n_components=pca_n)
            pca_train = [row[1] for row in test_data]
            pca_data = pca.fit(pca_train).transform(pca_train)
            test = [[test_data[i][0], pca_data[i]]
                    for i in range(len(pca_data))]
            train_data = sc.parallelize(test).map(
                lambda part: lbp(part[0], part[1]))
            test_data = test

        model_lr = lr.train(train_data)

        err_lr = 0.0
        size = train_data.count()
        for i in range(size):
            err_lr = err_lr + abs(
                model_lr.predict(test_data[i][1]) - test_data[i][0])

        print("result: %s" % (err_lr / size))
        String = "Linear Regression Result:\n"
        String = String + str(model_lr.weights) + '\n'
        String = String + "Error: " + str(err_lr / size)
    finally:
        sc.stop()
    return String
def test_regression(self):
    """Check that every MLlib regressor learns the sign of the second feature.

    Linear, lasso and ridge SGD regression plus decision-tree,
    random-forest (100 trees, seed 1) and gradient-boosted-tree regressors
    are trained on four labelled points.  Each model must predict a
    non-positive value for the points labelled -1.0 and a positive value
    for those labelled 1.0.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Even positions hold the -1.0 examples, odd positions the 1.0 ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories

    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100,
        seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def trainModel(data, rdd):
    """Train one model with scikit-learn and one with Spark MLlib.

    The scikit-learn model is fitted on ``data`` using column 0 as the
    target and the remaining columns as features.  The MLlib model is
    trained on ``rdd`` with tuned hyperparameters.

    Returns:
        Tuple ``(sklearnModel, mllibModel)``.
    """
    targets = data[:, 0]
    inputs = data[:, 1:]

    skRegressor = sklearnLR()
    skRegressor.fit(inputs, targets)

    # Tuned hyperparameters
    sgdSettings = dict(
        intercept=True,
        iterations=1000,
        miniBatchFraction=0.1,
        step=5,
        convergenceTol=1e-7,
    )
    sparkRegressor = LinearRegressionWithSGD.train(rdd, **sgdSettings)
    return skRegressor, sparkRegressor
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    """Train a linear model and return its RMSLE on ``test``.

    Args:
        train: RDD of labelled points to fit on.
        test: RDD of labelled points to score on.
        iterations: number of SGD iterations.
        step: SGD step size.
        regParam: regularization strength.
        regType: regularizer type forwarded to the trainer.
        intercept: whether to fit an intercept.

    Returns:
        Square root of the mean ``squared_log_error`` over ``test``.
    """
    sgd_model = LinearRegressionWithSGD.train(
        train, iterations, step,
        regParam=regParam, regType=regType, intercept=intercept)
    actual_and_predicted = test.map(
        lambda p: (p.label, sgd_model.predict(p.features)))
    log_errors = actual_and_predicted.map(
        lambda tp: squared_log_error(tp[0], tp[1]))
    return np.sqrt(log_errors.mean())
def iterateLRwSGDBatch(iterNums, stepSizes, fractions, train, valid):
    """Print train and validation RMSE for a grid of mini-batch SGD settings.

    One model with an intercept is trained per combination of iteration
    count, step size and mini-batch fraction.  Each output line shows the
    settings followed by the RMSE on ``train`` and on ``valid``.
    """
    def root_mean_squared(model, points):
        pairs = points.map(lambda x: (model.predict(x.features), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0] - p[1], 2)).mean())

    for numIter in iterNums:
        for step in stepSizes:
            for miniBFraction in fractions:
                trainer = LinearRegressionWithSGD()
                fitted = trainer.train(train, intercept=True,
                                       iterations=numIter, step=step,
                                       miniBatchFraction=miniBFraction)
                trainError = root_mean_squared(fitted, train)
                validError = root_mean_squared(fitted, valid)
                print("%d, %5.3f %5.3f -> %.4f, %.4f" % (
                    numIter, step, miniBFraction, trainError, validError))
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    """Train a linear model and hand its test predictions to ``actual_pred_error``.

    The function returns nothing; whatever ``actual_pred_error`` produces
    is discarded.

    Args:
        train: RDD of labelled points to fit on.
        test: RDD of labelled points to score on.
        iterations: number of SGD iterations.
        step: SGD step size.
        regParam: regularization strength.
        regType: regularizer type forwarded to the trainer.
        intercept: whether to fit an intercept.
    """
    sgd_model = LinearRegressionWithSGD.train(
        train, iterations, step,
        regParam=regParam, regType=regType, intercept=intercept)
    # The fitted weights are available as sgd_model.weights if needed.
    labelled_predictions = test.map(
        lambda p: (p.label, sgd_model.predict(p.features)))
    actual_pred_error(labelled_predictions)
def linearRegression(features, sc, output_n):
    """Train on the first 70 points and predict the next 46.

    ``features`` is collected to the driver, sliced into a training part
    (indices 0..69) and a testing part (indices 70..115), and each part is
    redistributed with ``sc.parallelize``.  Previously the slices were
    computed but the code used the undefined names ``training_data`` and
    ``testing_data``.

    Args:
        features: RDD of labelled points (``.label`` and ``.features`` are
            read from each element).
        sc: SparkContext used to redistribute the slices.
        output_n: unused; kept for interface compatibility.

    Returns:
        Tuple ``(model, prediction)`` where ``prediction`` is an RDD of
        ``(label, predicted_value)`` pairs for the testing part.
    """
    features_and_label = features.collect()
    training_data = sc.parallelize(features_and_label[0:70])
    testing_data = sc.parallelize(features_and_label[70:116])

    # NOTE(review): iterations=0 asks for zero SGD iterations -- confirm
    # this is intended rather than a placeholder.
    linearregression_model = LinearRegressionWithSGD.train(
        training_data, iterations=0, regParam=200)
    prediction = testing_data.map(lambda line: (
        line.label, linearregression_model.predict(line.features)))
    return linearregression_model, prediction
def iterateLRwSGD(iterNums, stepSizes, train, valid):
    """Print train and validation RMSE for a grid of SGD settings.

    One model with an intercept is trained per combination of iteration
    count and step size.  Each output line shows the settings followed by
    the RMSE on ``train`` and on ``valid``.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD
    import math

    def root_mean_squared(model, points):
        pairs = points.map(
            lambda x: (float(model.predict(x.features)), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0] - p[1], 2)).mean())

    for numIter in iterNums:
        for step in stepSizes:
            trainer = LinearRegressionWithSGD()
            fitted = trainer.train(train, iterations=numIter, step=step,
                                   intercept=True)
            trainError = root_mean_squared(fitted, train)
            validError = root_mean_squared(fitted, valid)
            print("%d, %5.3f -> %.4f, %.4f" % (
                numIter, step, trainError, validError))
def get_best_result(best_step_size, training_lp, testing_lp, iterations):
    """Train a linear model and describe its test RMSE.

    The reported value is now a true root-mean-squared error: the summed
    squared error is divided by the number of test points before the square
    root.  Previously the square root was taken of the raw sum.

    Args:
        best_step_size: SGD step size to train with.
        training_lp: RDD of labelled points to fit on.
        testing_lp: RDD of labelled points to score on.
        iterations: number of SGD iterations.

    Returns:
        A human-readable string with the step size and the RMSE.
    """
    model = LinearRegressionWithSGD.train(training_lp,
                                          iterations=iterations,
                                          step=best_step_size)
    values_and_preds = testing_lp.map(
        lambda p: (p.label, model.predict(p.features)))
    squared_error = values_and_preds.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(operator.add)
    MSE = squared_error / float(values_and_preds.count())
    RMSE = math.sqrt(MSE)
    result_str = 'best step size got by cross validation cv: ' + str(
        best_step_size) + ', lowest RMSE: ' + str(RMSE)
    return result_str
def test_regression(self):
    """Check that every MLlib regressor learns the sign of the second feature.

    Linear, lasso and ridge SGD regression plus decision-tree,
    random-forest (100 trees) and gradient-boosted-tree regressors are
    trained on four labelled points.  Each model must predict a
    non-positive value for the points labelled -1.0 and a positive value
    for those labelled 1.0.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Even positions hold the -1.0 examples, odd positions the 1.0 ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories

    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def rmse_mae_gd(trainset, testset):
    """Train a plain SGD linear model and print its RMSE and MAE.

    The RMSE on ``testset`` is also appended to the module-level
    ``mean_RMSE`` list.
    """
    gd_model = LinearRegressionWithSGD.train(
        trainset, iterations=50, step=0.00000001)
    predicted = testset.map(lambda p: (p.label, gd_model.predict(p.features)))

    squared_sum = predicted.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
    RMSE = sqrt(squared_sum / predicted.count())

    absolute_sum = predicted.map(
        lambda vp: abs(vp[0] - vp[1])).reduce(lambda x, y: x + y)
    MAE = absolute_sum / predicted.count()

    mean_RMSE.append(RMSE)
    print("Root Mean Squared Error for Gradient Descent: " + str(RMSE))
    print("Mean Absolute Error for Gradient Descent: " + str(MAE))
def main():
    """Train linear regression on log-transformed labels and print metrics.

    Loads records with ``get_records``, builds category mappings for
    columns 2..9, converts every record to a ``LabeledPoint``, replaces each
    label by its natural logarithm, trains ``LinearRegressionWithSGD`` and
    passes the exponentiated ``(true, predicted)`` pairs to
    ``calculate_print_metrics``.

    The unused ``num_len`` and ``total_len`` locals were removed; computing
    them ran an extra ``records.first()`` job for no effect.
    """
    records = get_records()
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    data_log = data.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))

    model_log = LinearRegressionWithSGD.train(data_log, iterations=10, step=0.1)

    # Undo the log transform on both sides before computing metrics.
    true_vs_predicted_log = data_log.map(
        lambda p: (np.exp(p.label), np.exp(model_log.predict(p.features))))
    calculate_print_metrics("Linear Regression Log", true_vs_predicted_log)
def xRMSerror(parsedDataTrain, parsedDataTest, numIterations=1000, stepsize=0):
    """Train on one set and return the RMSE on another.

    The first five ``(label, prediction)`` pairs are printed as a sanity
    check.

    Args:
        parsedDataTrain: RDD of labelled points to fit on.
        parsedDataTest: RDD of labelled points to score on.
        numIterations: number of SGD iterations (default 1000, the
            previously hard-coded value).
        stepsize: SGD step size (default 0, the previously hard-coded
            value).
            NOTE(review): a step size of 0 looks like a placeholder --
            confirm the intended default.

    Returns:
        The root-mean-squared error on ``parsedDataTest``.
    """
    model = LinearRegressionWithSGD.train(parsedDataTrain, numIterations, stepsize)

    # Evaluate the model on the test data
    valuesAndPreds = parsedDataTest.map(
        lambda p: (p.label, model.predict(p.features)))
    print(valuesAndPreds.take(5))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return math.sqrt(MSE)
def get_best_stepsize(step_sizes, training_lp, testing_lp, iterations):
    """Find the step size with the lowest test RMSE and describe it.

    The error is now a true root-mean-squared error: the summed squared
    error is divided by the number of test points before the square root.
    Previously the square root was taken of the raw sum.

    Args:
        step_sizes: iterable of candidate SGD step sizes.
        training_lp: RDD of labelled points to fit on.
        testing_lp: RDD of labelled points to score on.
        iterations: number of SGD iterations per model.

    Returns:
        A human-readable string with the best step size and its RMSE.
    """
    best_stepsize = 0
    lowest_RMSE = float("inf")
    for step_size in step_sizes:
        model = LinearRegressionWithSGD.train(training_lp,
                                              iterations=iterations,
                                              step=step_size)
        values_and_preds = testing_lp.map(
            lambda p: (p.label, model.predict(p.features)))
        squared_error = values_and_preds.map(
            lambda vp: (vp[0] - vp[1]) ** 2).reduce(operator.add)
        MSE = squared_error / float(values_and_preds.count())
        RMSE = math.sqrt(MSE)
        if RMSE < lowest_RMSE:
            lowest_RMSE = RMSE
            best_stepsize = step_size
    result_str = ('best step size: ' + str(best_stepsize)
                  + ', lowest RMSE: ' + str(lowest_RMSE))
    return result_str
def lr_example():
    """Train linear regression on the bundled Twitter sample and print RMSE.

    Loads the example CSV next to this file, converts it to feature rows,
    keeps rows with both an age and a feature, and builds ``LabeledPoint``
    objects labelled by age.  Rows with an even index form the training
    set; rows with an odd index whose feature length matches the first
    training row form the test set.  Each test label and prediction is
    printed, followed by the root-mean-squared error.

    A ``converted.take(3)`` whose result was never used has been removed,
    and the ``math`` import now sits at the top of the function.
    """
    import math

    min_freq = 1
    n_common = 10
    pwd = os.path.dirname(os.path.abspath(__file__))
    path = pwd + '/example_data/twitter_2020-03-10_slim.csv'
    print(path)
    df = csv_parser.load_as_df(path, twitter_schema)
    df.show(3)

    converted = featurizer.convert_df_to_feature(
        df, n_common, min_freq).filter(
            lambda row: row['age'] is not None and row['feature'] is not None)
    converted = converted.map(  # (age, sex, feature)
        lambda row: LabeledPoint(row['age'], concat_vectors(row['feature'])))
    converted = converted.zipWithIndex()

    train_rdd = converted.filter(lambda x: x[1] % 2 == 0).map(lambda x: x[0])
    feature_dim = len(train_rdd.first().features)
    # Collected to the driver: this is a plain list, not an RDD.
    test_points = converted.filter(lambda x: x[1] % 2 == 1).map(
        lambda x: x[0]).filter(
            lambda x: len(x.features) == feature_dim).collect()

    print("confirming dim of train rdd")
    for e in train_rdd.take(3):
        print(e.features)
        print(len(e.features))

    lrm = LinearRegressionWithSGD.train(train_rdd)

    n = len(test_points)
    mse = 0
    # Test
    for lp in test_points:
        gt = lp.label
        pred = lrm.predict(lp.features)
        print(gt, pred)
        mse += (pred - gt) * (pred - gt)
    rmse = math.sqrt(mse / n)
    print('Root mean square error: ' + str(rmse))
def evaluate_lm(train_set, test_set, step, batch_pct, reg, reg_param, iterations=100):
    """Fit an SGD linear regression on ``train_set`` and score it on ``test_set``.

    The model is trained with an intercept and with input validation turned
    off. ``step``, ``batch_pct``, ``reg`` and ``reg_param`` are forwarded as
    the step size, mini-batch fraction, regularizer type and regularization
    strength respectively.

    Returns:
        Whatever ``get_lr_evals`` returns for the RDD of
        ``(label, prediction)`` pairs built from ``test_set``.
    """
    train_options = dict(
        iterations=iterations,
        step=step,
        miniBatchFraction=batch_pct,
        regType=reg,
        regParam=reg_param,
        intercept=True,
        validateData=False,
    )
    lm = LinearRegressionWithSGD.train(train_set, **train_options)

    def label_and_prediction(point):
        # Cast so the pair holds a plain Python float.
        return (point.label, float(lm.predict(point.features)))

    return get_lr_evals(test_set.map(label_and_prediction))
def LinearRegression(filename, sc):
    """Train a linear regression on a text data file and print its training MSE.

    Args:
        filename: path of the data file, parsed line by line with
            ``parsePoint``. When falsy, the historical default location is
            used. (Previously the argument was always overwritten by that
            hard-coded path.)
        sc: SparkContext used to read the file.

    Prints the mean squared error on the training data; returns nothing.
    """
    if not filename:
        filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
    data = sc.textFile(filename)
    parsedData = data.map(parsePoint)

    # train the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    # Index into the pair: tuple-parameter lambdas are a SyntaxError on Python 3.
    squared_errors = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2)
    MSE = squared_errors.reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

    # Save and load model
    #model.save(sc, "myModelPath")
    #sameModel = LinearRegressionModel.load(sc, "myModelPath")
def test_spark():
    """Smoke-test MLlib: fit a linear regression on lpsa.data and print its MSE.

    Reads the data file shipped with the local Spark 1.6.1 install through the
    module-level ``sc``, prints the parsed points, the training-set mean
    squared error and the fitted model.
    """
    def parsePoint(line):
        # split() with no argument tolerates repeated or trailing blanks;
        # split(' ') would yield '' tokens there and float('') raises.
        values = [float(x) for x in line.replace(',', ' ').split()]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile(r"/usr/local/Cellar/apache-spark/1.6.1/libexec/data/mllib/ridge-data/lpsa.data")
    parsedData = data.map(parsePoint)
    print(parsedData.collect())

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    # Index into the pair: tuple-parameter lambdas are a SyntaxError on Python 3.
    squared_errors = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2)
    MSE = squared_errors.reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    print("Model coefficients: " + str(model))
def algo(a):
    """Train the model selected by ``a`` on the module-level data and time it.

    Reads the module-level ``data``, ``week`` and ``target``. Only
    ``a == 'sgd'`` is handled: it fits ``LinearRegressionWithSGD`` on the
    single labeled point built from ``target``/``data``, then prints the
    absolute prediction for ``week`` and the elapsed wall-clock seconds.
    Any other value of ``a`` does nothing further.
    """
    test = week
    # NOTE(review): the result is never used; the call is kept in case
    # ``convert`` has side effects -- confirm and drop if not.
    week_target = week.map(convert)

    data_final = LabeledPoint(target, data)  # single training example

    if a == 'sgd':
        # The assignment used to be commented out, so the timing print below
        # raised NameError.
        time_0 = time.time()
        # parallelize() needs a collection; a bare LabeledPoint is not iterable.
        lrm = LinearRegressionWithSGD.train(sc.parallelize([data_final]),
                                            iterations=10,
                                            initialWeights=np.array([1.0]))
        print(abs(lrm.predict(test)))
        print(time.time() - time_0)
def linearRegression(features, sc, output_n):
    """Fit a linear regression on the first 70 rows and predict the rest.

    Args:
        features: RDD whose elements are indexable as ``(label, features)``.
            It is collected to the driver, so it must fit in memory.
        sc: SparkContext used to distribute the train and test points.
        output_n: unused; kept for interface compatibility.

    Returns:
        An RDD of ``(label, prediction)`` pairs for the rows after the first 70.
    """
    rows = features.collect()
    labeled_training = [LabeledPoint(row[0], row[1]) for row in rows[0:70]]
    labeled_testing = [LabeledPoint(row[0], row[1]) for row in rows[70:]]

    test = sc.parallelize(labeled_testing)
    # train() needs an RDD; the plain Python list passed previously fails
    # inside MLlib, which calls RDD methods on its input.
    train = sc.parallelize(labeled_training)

    # NOTE(review): iterations=0 presumably leaves the weights at their
    # initial value, and regParam is given without a regType -- confirm both
    # are intended.
    linearregression_model = LinearRegressionWithSGD.train(train, iterations=0, regParam=200)

    predictions = test.map(
        lambda line: (line.label, float(linearregression_model.predict(line.features))))
    return predictions
def linearRegression_f(mode):
    """Train a linear model on the module-level ``parsedData`` and return its MSE.

    Args:
        mode: ``"no_reg"`` (plain SGD linear regression), ``"L1_reg"``
            (Lasso) or ``"L2_reg"`` (ridge regression).

    Returns:
        The mean squared error of the fitted model on ``parsedData`` itself.

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.
            Previously this printed "ERROR Mode" and then crashed on the
            unbound ``model`` variable.
    """
    # Validate before touching Spark so a bad mode fails fast and clearly.
    if mode not in ("no_reg", "L1_reg", "L2_reg"):
        raise ValueError("ERROR Mode: %r" % (mode,))

    if mode == "no_reg":
        trainer = LinearRegressionWithSGD
    elif mode == "L1_reg":
        trainer = LassoWithSGD
    else:
        trainer = RidgeRegressionWithSGD
    model = trainer.train(parsedData)

    # Evaluate on the training data: (label, prediction) pairs.
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    # Index into the pair: tuple-parameter lambdas are a SyntaxError on Python 3.
    squared_errors = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2)
    MSE = squared_errors.reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return MSE
def LinearRegression(trainFile, testFile, taskid, sc):
    """Train on one LIBSVM file, evaluate on another, and print the test MSE.

    Args:
        trainFile: path of the LIBSVM-format training file.
        testFile: path of the LIBSVM-format test file.
        taskid: unused; kept for interface compatibility.
        sc: SparkContext used to load both files.

    Prints the mean squared error on the test data; returns nothing.
    """
    trainData = MLUtils.loadLibSVMFile(sc, trainFile)
    testData = MLUtils.loadLibSVMFile(sc, testFile)

    # train the model
    model = LinearRegressionWithSGD.train(trainData)

    # Evaluate the model on the test data; pairs are (label, prediction).
    labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    # Index into the pair: tuple-parameter lambdas are a SyntaxError on Python 3.
    squared_errors = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2)
    MSE = squared_errors.reduce(lambda x, y: x + y) / labelsAndPreds.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

    # Save and load model
    #model.save(sc, "myModelPath")
    #sameModel = LinearRegressionModel.load(sc, "myModelPath")
def get_best_stepsize(step_sizes, training_lp, iterations, cv_trails):
    """Choose the SGD step size with the lowest 4-fold cross-validated RMSE.

    Args:
        step_sizes: iterable of candidate step sizes.
        training_lp: RDD of labeled points; split randomly into 4 equally
            weighted folds.
        iterations: number of SGD iterations per model.
        cv_trails: kept for interface compatibility; no longer used. The
            average used to be divided by this value instead of the number
            of folds, which broke for 0 and inverted the ranking for
            negative values.

    Returns:
        The best step size, or 0 when ``step_sizes`` is empty.

    Raises:
        ValueError: from ``reduce`` if a fold turns out to be empty.
    """
    best_stepsize = 0
    lowest_RMSE = float("inf")
    num_folds = 4
    folds = training_lp.randomSplit([1] * num_folds)

    # Build each (training, testing) pair once, training on the union of the
    # other folds. The previous ``training_lp.subtract(fold)`` matches
    # elements by hash/equality, which LabeledPoint does not appear to define
    # by value, so the test fold could leak into the training data.
    sc = training_lp.context
    splits = []
    for i, cv_testing in enumerate(folds):
        other_folds = [fold for j, fold in enumerate(folds) if j != i]
        splits.append((sc.union(other_folds), cv_testing))

    for step_size in step_sizes:
        total_RMSE = 0.0
        for cv_training, cv_testing in splits:
            model = LinearRegressionWithSGD.train(cv_training, iterations=iterations,
                                                  step=step_size)
            values_and_preds = cv_testing.map(
                lambda p: (p.label, model.predict(p.features)))
            SSE = values_and_preds.map(
                lambda vp: (vp[0] - vp[1]) ** 2).reduce(operator.add)
            # Divide by the fold size so this really is a root-MEAN-square
            # error and folds of different sizes are comparable.
            total_RMSE += math.sqrt(SSE / float(values_and_preds.count()))
        avg_RMSE = total_RMSE / num_folds
        if avg_RMSE < lowest_RMSE:
            lowest_RMSE = avg_RMSE
            best_stepsize = step_size
    return best_stepsize
def train_amount_model(self, model, data, i):
    """Train (or continue training) the amount-prediction model.

    The algorithm is selected by ``self.amount_prediction_method``.

    Args:
        model: a previously trained model, or None. It is passed on to the
            neural network trainer, and its ``weights`` seed the linear
            regression; the random forest ignores it.
        data: local collection of training examples; distributed with
            ``self.sc.parallelize``.
        i: unused; kept for interface compatibility.

    Returns:
        The newly trained model.

    Raises:
        ValueError: if ``self.amount_prediction_method`` is not one of the
            known methods.
    """
    rdd_data = self.sc.parallelize(data)
    self.logger.info('Start to train the amount model')

    if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK:
        input_num = self.feature_num
        # Floor division keeps the layer widths integral on Python 3 as well;
        # plain ``/`` would produce float sizes there.
        layers = [input_num, input_num // 3 * 2, input_num // 3, 1]
        neural_network = NeuralNetworkSpark(layers=layers, bias=0)
        model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234,
                                     learn_rate=0.0001, iteration=15, model=model)
    elif self.amount_prediction_method == self.RANDOM_FOREST:
        model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={},
                                            numTrees=40, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=20, maxBins=32)
    elif self.amount_prediction_method == self.LINEAR_REGRESSION:
        # Warm-start from the previous model's weights when there is one.
        initial_weights = model.weights if model is not None else None
        model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                              initialWeights=initial_weights)
    else:
        message = "Unknown training method {}".format(self.amount_prediction_method)
        self.logger.error(message)
        raise ValueError(message)

    return model
def test_regression(self):
    """Every regressor should predict the sign of the label on sparse input.

    Four points are built with ``self.scipy_matrix``; the sign of the label
    matches the sign of feature 1. Each trained model must predict a
    non-positive value for the negative points and a positive value for the
    positive ones.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree

    label_and_value = ((-1.0, -1.0), (1.0, 1.0), (-1.0, -2.0), (1.0, 2.0))
    data = [LabeledPoint(label, self.scipy_matrix(2, {1: value}))
            for label, value in label_and_value]
    rdd = self.sc.parallelize(data)
    features = [point.features for point in data]

    def check_signs(model):
        # Points 0 and 2 are the negative examples, 1 and 3 the positive ones.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    # Train and check one model at a time, in the same order as before.
    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        check_signs(trainer.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def learn_model(sc, file_path, normalize):
    """Train an L1-regularized SGD linear regression until the test MSE repeats.

    The model is retrained from scratch with 100, 200, 300, ... iterations
    until two consecutive runs give exactly the same test MSE.

    Args:
        sc: SparkContext used to read the file.
        file_path: tab-separated text file. Column 1 is used as the label and
            columns 2 onward as features; column 0 is not used.
        normalize: when truthy, feature vectors are passed through
            ``Normalizer`` before training.

    Returns:
        The model from the last training run. The ``{iterations: MSE}``
        history and the per-run timings are printed.
    """
    feature_file = sc.textFile(file_path).map(lambda l: l.split("\t"))
    points = feature_file.map(lambda f: LabeledPoint(f[1], f[2:]))

    # normalizing
    if normalize:
        nor = Normalizer()
        labels = points.map(lambda x: x.label)
        features = points.map(lambda x: x.features)
        points = labels.zip(nor.transform(features))
        points = points.map(lambda i: LabeledPoint(i[0], i[1]))

    training, testing = points.randomSplit([0.7, 0.3], 11)

    iterations = 100
    p_mse = -1  # squared errors are never negative, so the first run cannot match
    result = {}
    while True:
        # Wall-clock time: time.clock() was removed in Python 3.8.
        started = time.time()
        model = LinearRegressionWithSGD.train(training, iterations=iterations,
                                              step=0.00001, intercept=True, regType="l1")
        print("========== time = " + str(time.time() - started))

        preds = testing.map(lambda p: (p.label, model.predict(p.features)))
        MSE = preds.map(lambda r: (r[1] - r[0]) ** 2).reduce(lambda x, y: x + y) / preds.count()
        print("========== MSE = " + str(MSE))

        # Record under the iteration count that actually produced this MSE
        # (it used to be stored after the increment, i.e. off by 100).
        result[iterations] = MSE
        if p_mse == MSE:
            break
        p_mse = MSE
        iterations += 100

    print(result)
    return model
return LabeledPoint(values[7], values[0:11]) #data_file = sc.textFile("/home/faiz89/Desktop/Eastman/2008.csv") data_file = sc.textFile("../2008_small.csv") header = data_file.first () raw_data = data_file.filter (lambda x:x != header) #examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect() parsedData = raw_data.map(parsePoint) (trainingData, testData) = parsedData.randomSplit([0.7, 0.3]) startTime = datetime.now() # Build the model trainingData.cache () model = LinearRegressionWithSGD.train(trainingData, iterations=1) print ('Training Time consumed = '), (datetime.now() - startTime) startTestTime = datetime.now() testData.cache() # Evaluating the model on training data valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features))) MSE = valuesAndPreds \ .map(lambda (v, p): (v - p)**2) \ .reduce(lambda x, y: x + y) / valuesAndPreds.count() print ('Testing Time consumed = '), (datetime.now() - startTestTime) print ('Total Time: '), (datetime.now() - startTime) print("Mean Squared Error = " + str(MSE)) # Save and load model model.save(sc, "LinearRegressionNarrow2008_cache_both_train_and_test") sameModel = LinearRegressionModel.load(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
# Hyperparameters for the first linear regression model.
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept

# In[62]:

# Train on the parsed training data with the settings above.
firstModel = LinearRegressionWithSGD.train(parsedTrainData,
                                           iterations=numIters,
                                           step=alpha,
                                           miniBatchFraction=miniBatchFrac,
                                           initialWeights=None,
                                           regParam=reg,
                                           regType=regType,
                                           intercept=useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
# One formatted string prints "weights intercept" identically on Python 2
# and 3; the old print statement is a SyntaxError on Python 3.
print('%s %s' % (weightsLR1, interceptLR1))

# In[63]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
# Standardize the feature vectors with a fitted StandardScaler and show a sample.
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print features_transform.take(5)
# Column 0 of each row of df is taken as the label and paired back up with the
# scaled features by position.
lab = df.map(lambda row: row[0])
transformedData = lab.zip(features_transform)
# NOTE(review): the scaled vector is wrapped in a one-element list here, and
# the prediction lambdas below undo that with row.features[0]. The two must
# change together -- confirm the resulting feature shape is what is intended.
transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]]))
# 80/20 train/test split with a fixed seed.
trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)
# NOTE(review): lr is not used anywhere in the visible part of this script.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
# Positional arguments: 1000 and .0002 are presumably the iteration count and
# step size -- confirm against the train() signature in use.
linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
print linearModel.weights
print testingData.take(10)
# Spot-check one hand-written feature vector.
print linearModel.predict([5.20814108601,42.4568179338,0.443700296128,6.20889144381,58.6223297308])
#actual 54.022
# Score the model on the training data: (prediction, label) pairs.
prediObserRddIn = trainingData.map(lambda row: (float(linearModel.predict(row.features[0])), row.label))
metrics = RegressionMetrics(prediObserRddIn)
print metrics.r2
print metrics.rootMeanSquaredError
# Predict on the testing data: (prediction, label) pairs.
prediObserRddOut = testingData.map(lambda row: (float(linearModel.predict(row.features[0])), row.label))
metricsOut = RegressionMetrics(prediObserRddOut)
#Section 7.4.6
# Standardize the housing features (zero mean, unit variance) using
# statistics fitted on the training features only.
from pyspark.mllib.feature import StandardScaler


def _as_labeled_point(label_and_features):
    """Rebuild a LabeledPoint from a zipped (label, features) pair."""
    return LabeledPoint(label_and_features[0], label_and_features[1])


trainLabel = housingTrain.map(lambda point: point.label)
trainFeatures = housingTrain.map(lambda point: point.features)
validLabel = housingValid.map(lambda point: point.label)
validFeatures = housingValid.map(lambda point: point.features)
scaler = StandardScaler(withMean=True, withStd=True).fit(trainFeatures)
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(_as_labeled_point)
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(_as_labeled_point)

#Section 7.5
# Fit a linear regression with an intercept on the scaled training data.
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
# (prediction, label) pairs on the validation set and their RMSE.
validPredicts = validScaled.map(
    lambda point: (float(model.predict(point.features)), point.label))
validPredicts.collect()
import math
RMSE = math.sqrt(validPredicts.map(lambda pair: (pair[0] - pair[1]) ** 2).mean())

#Section 7.5.2
# The same figures through MLlib's built-in metrics.
from pyspark.mllib.evaluation import RegressionMetrics
validMetrics = RegressionMetrics(validPredicts)
validMetrics.rootMeanSquaredError
validMetrics.meanSquaredError

#Section 7.5.3
import operator
# In[77]:

from pyspark.mllib.regression import LinearRegressionWithSGD

# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept

# In[79]:

# Pass the hyperparameters by keyword so the call does not depend on the
# positional order of train()'s parameters.
firstModel = LinearRegressionWithSGD.train(parsedTrainData,
                                           iterations=numIters,
                                           step=alpha,
                                           miniBatchFraction=miniBatchFrac,
                                           initialWeights=None,
                                           regParam=reg,
                                           regType=regType,
                                           intercept=useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
# One formatted string prints "weights intercept" identically on Python 2
# and 3; the old print statement is a SyntaxError on Python 3.
print('%s %s' % (weightsLR1, interceptLR1))

# In[80]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
expectedWeights = [16.682292427, 14.7439059559, -0.0935105608897, 6.22080088829,
                   4.01454261926, -3.30214858535, 11.0403027232, 2.67190962854,
                   7.18925791279, 4.46093254586, 8.14950409475, 2.75135810882]
Test.assertTrue(np.allclose(interceptLR1, expectedIntercept), 'incorrect value for interceptLR1')
Test.assertTrue(np.allclose(weightsLR1, expectedWeights), 'incorrect value for weightsLR1')
if __name__ == "__main__":
    sc = SparkContext(appName="Regression Metrics Example")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        """Turn one 'label idx:value idx:value ...' line into a LabeledPoint."""
        tokens = line.split()
        label = float(tokens[0])
        # Keep only the value part of each idx:value token.
        feature_values = []
        for token in tokens[1:]:
            feature_values.append(float(token.split(':')[1]))
        return LabeledPoint(label, DenseVector(feature_values))

    data = sc.textFile("data/mllib/sample_linear_regression_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Get predictions as (prediction, label) pairs
    def predictionAndLabel(point):
        return (float(model.predict(point.features)), point.label)

    valuesAndPreds = parsedData.map(predictionAndLabel)

    # Instantiate metrics object
    metrics = RegressionMetrics(valuesAndPreds)

    # Squared Error
    print("MSE = %s" % metrics.meanSquaredError)
    print("RMSE = %s" % metrics.rootMeanSquaredError)

    # R-squared
    print("R-squared = %s" % metrics.r2)

    # Mean absolute error
print(parsedData.take(3))

# In[58]:

# Divide parsedData into training, validation and test sets (80/10/10)
weights = [.8, .1, .1]
seed = 50
parsedTrainData, parsedValData, parsedTestData = parsedData.randomSplit(weights, seed)

# In[64]:

# Fit the model with default values
fitModel = LinearRegressionWithSGD.train(parsedTrainData)
print(fitModel)

# In[65]:

# Prediction for the first training point
testPoint = parsedTrainData.take(1)[0]
print(testPoint.label)
testPrediction = fitModel.predict(testPoint.features)
# Print the value just computed; the old code printed the undefined name
# samplePrediction and raised NameError.
print(testPrediction)