Example #1
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
def main():
    #reading test and train data
    trainData = sc.pickleFile(input +
                              '/Train_data_unnormalized.pickle/part-00000')
    testData = sc.pickleFile(input +
                             '/Test_data_unnormalized.pickle/part-00000')
    parsedData = trainData.map(parseInput).filter(
        lambda line: len(line.features) != 0)
    parsedTestData = testData.map(parseInput).filter(
        lambda line: len(line.features) != 0)

    numIterations = 100
    stepSize = [0.1, 10, 20]
    BestError = 1000000

    BestStep = 0
    BestSplit = []
    splits = [[1, 2], [1, 3]]

    #Cross Validation
    for x in stepSize:
        for y in splits:
            (Train_RDD, Valid_RDD) = trainData.randomSplit(y, 10)
            parsed_input = Train_RDD.map(parseInput).filter(
                lambda line: len(line.features) != 0)
            parsed_valid = Valid_RDD.map(parseInput).filter(
                lambda line: len(line.features) != 0)
            try:
                model = LinearRegressionWithSGD.train(parsed_input,
                                                      iterations=numIterations,
                                                      step=x)
                valuesAndPreds = parsed_valid.map(
                    lambda p: (p.label, model.predict(p.features)))
                MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
                    lambda a, b: a + b) / valuesAndPreds.count()
                RMSE = math.sqrt(MSE)
            except Exception:
                continue  # skip this parameter combination if training fails
            if RMSE < BestError:
                BestError = RMSE
                BestStep = x
                BestSplit = y

    #Finding test error

    model = LinearRegressionWithSGD.train(parsedData,
                                          iterations=numIterations,
                                          step=BestStep)
    valuesAndPreds = parsedTestData.map(lambda p:
                                        (p.label, model.predict(p.features)))
    MSE_test = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
        lambda a, b: a + b) / valuesAndPreds.count()
    RMSE_test = math.sqrt(MSE_test)

    print("Best Root Mean Squared Error Validation = " + str(BestError))
    print("Best Root Mean Squared Error Test= " + str(RMSE_test))
    print("Best StepSize = " + str(BestStep))
    print(BestSplit)
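
The snippet above assumes a parseInput helper that turns each pickled record into a LabeledPoint. A minimal sketch, assuming each record is a (label, feature-list) pair (the actual layout may differ):

from pyspark.mllib.regression import LabeledPoint

def parseInput(record):
    # hypothetical helper: assumes record = (label, [feature, ...])
    label, features = record
    return LabeledPoint(float(label), [float(x) for x in features])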
def regularized(trainingData, testData, trainingSize, testSize, regTypeVal):
    '''
    Least squares with the regularization given by regTypeVal
    (e.g. 'l1' for lasso, 'l2' for ridge).
    '''
    # train a lr model
    numIterValList = [3000, 5000, 10000]
    stepSizeValList = [1e-11, 1e-9, 1e-7]
    regParamValList = [0.01, 0.1, 1, 10]

    # variable for the best parameters
    bestNumIterVal = 200
    bestStepSizeVal = 1
    bestTrainingRMSE = 1e10
    bestRegParamVal = 0.0

    for numIterVal, stepSizeVal, regParamVal in itertools.product(
            numIterValList, stepSizeValList, regParamValList):
        model = LinearRegressionWithSGD.train(trainingData,
                                              iterations=numIterVal,
                                              step=stepSizeVal,
                                              regParam=regParamVal,
                                              regType=regTypeVal)
        ValsAndPreds = trainingData.map(lambda p:
                                        (p.label, model.predict(p.features)))
        trainingRMSE = math.sqrt(
            ValsAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
                lambda a, b: a + b) / trainingSize)
        if trainingRMSE and trainingRMSE < bestTrainingRMSE:
            bestNumIterVal = numIterVal
            bestStepSizeVal = stepSizeVal
            bestRegParamVal = regParamVal
            bestTrainingRMSE = trainingRMSE
        print(numIterVal, stepSizeVal, regParamVal, trainingRMSE)
    print(bestNumIterVal, bestStepSizeVal, bestRegParamVal, bestTrainingRMSE)

    model = LinearRegressionWithSGD.train(trainingData,
                                          iterations=bestNumIterVal,
                                          step=bestStepSizeVal,
                                          regParam=bestRegParamVal,
                                          regType=regTypeVal)

    # Evaluating the model on training data
    ValsAndPreds = trainingData.map(lambda p:
                                    (p.label, model.predict(p.features)))
    trainingRMSE = math.sqrt(
        ValsAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
            lambda a, b: a + b) / trainingSize)
    print(trainingRMSE)

    # Evaluating the model on test data
    ValsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    testRMSE = math.sqrt(
        ValsAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
            lambda a, b: a + b) / testSize)
    print(testRMSE)
def hadamard_fit(data):
    # sample 1024 terms from data
    parsedData = data.map(lambda line: np.array([float(x) for x in line.split(',')]))
    rdd3 = sc.parallelize(parsedData.takeSample(True, 1024),2)

    # create a 1024 x 1024 Hadamard matrix via Sylvester's construction
    N = 10  # 2**10 = 1024
    H = np.zeros([1024, 1024])
    H[0, 0] = 1
    h = 1
    for i in range(N):
        H[0:h, h:2 * h] = H[0:h, 0:h]
        H[h:2 * h, 0:h] = H[0:h, 0:h]
        H[h:2 * h, h:2 * h] = -1 * H[0:h, 0:h]
        h = h * 2

    # multiply with the Hadamard matrix (collect once, reuse the rows)
    rows = rdd3.collect()
    lens = rows[0].shape[0]
    X_array = np.array(rows).reshape(1024, lens)
    X_hadamard = H.dot(X_array)

    x_rdd = sc.parallelize(X_hadamard)  # each entry is a numpy array
    subset = x_rdd.map(lambda x: LabeledPoint(x[-1], x[0:lens - 1])) \
        .randomSplit([0.8, 0.2])  # split into training and testing
    x_rp = subset[0].filter(mat_B_filter)  # mat_B_filter acts as the sampling matrix B
    model3 = LinearRegressionWithSGD.train(x_rp, iterations=100,
                                           step=0.00000001, regType=None)
    # Evaluate the model on the held-out split
    valuesAndPreds = subset[1].map(lambda p: (p.label, model3.predict(p.features)))
    MSE = valuesAndPreds \
              .map(lambda vp: (vp[0] - vp[1]) ** 2) \
              .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
def regression():
    #Regression Point
    #Reads the data from the joinedResults directory as a parquet file
    datadf = sqlContext.read.parquet(output+"/joinedResults")
    datadf.show()
    data = datadf.rdd.map(lambda w: (float(w.avg_prcp), int(w.yy), float(w.latitude), float(w.longitude)))
    max_prcp = data.max()
    min_prcp = data.min()
    lat = data.map(lambda x: (x[2])).cache()
    min_lat = lat.min()
    max_lat = lat.max()

    longt =  data.map(lambda x: (x[3])).cache()
    min_long = longt.min()
    max_long = longt.max()
    
    max_ = [max_prcp[0], float(2050), max_lat, max_long]
    min_ = [min_prcp[0], float(1990), min_lat, min_long]
    # change the format to fit in LinearRegression library
    parsedData = data.map(lambda x: parsePointPrediction(x, max_, min_)).cache()
    # Split data approximately into training (80%) and test (20%)
    trainData, testData = parsedData.randomSplit([0.8, 0.2], seed=0)
    trainData.cache()
    testData.cache()
    # Build the model; the parameters were found by trial and error.
    model = LinearRegressionWithSGD.train(trainData, iterations=500, regType="l2", regParam=10, intercept=True)
    # Evaluate the model on test data
    valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    maxVal=max_prcp[0]

    model.save(sc, output+"/modelpath")
    return
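
parsePointPrediction is not defined in this snippet; a minimal sketch, assuming it min-max scales each field into [0, 1] with the max_/min_ vectors and uses the scaled precipitation as the label:

from pyspark.mllib.regression import LabeledPoint

def parsePointPrediction(x, max_, min_):
    # hypothetical helper: min-max scale each field to [0, 1];
    # field 0 (avg_prcp) becomes the label, the rest the features
    scaled = [(x[i] - min_[i]) / (max_[i] - min_[i]) for i in range(len(x))]
    return LabeledPoint(scaled[0], scaled[1:])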
Example #6
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)
def main(sc):
    # Loading the features:
    features_cr = sc.pickleFile('/tmp/features_saved')
    print(features_cr.first())

    # Getting the features ready for training
    numberFeatures = len(features_cr.first()) - 1
    mappings = [get_mapping(features_cr, i) for i in range(0, numberFeatures)]

    # Working with the Mapping:
    # Month:
    dictio_month = {}
    for i in range(12):
        dictio_month[i + 1] = i
    mappings[1] = dictio_month
    # Year: ?

    cat_len = sum(map(len, mappings))
    data = features_cr.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    print(features_cr.first())
    # Regression:
    linear_model = LinearRegressionWithSGD.train(data,
                                                 iterations=100,
                                                 step=0.25,
                                                 intercept=False)

    linear_model.save(sc, '/tmp/linear_model')
    print('OK model')
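
get_mapping, extract_label and extract_features are assumed helpers in this and later snippets. A hedged sketch of the usual one-of-k encoding pattern they tend to follow (names and record layout are assumptions):

import numpy as np

def get_mapping(rdd, idx):
    # map each distinct value in column idx to a consecutive integer
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

def extract_label(record):
    # assumes the label is the last field of the record
    return float(record[-1])

def extract_features(record, cat_len, mappings):
    # one-of-k encode the categorical columns using the mappings
    cat_vec = np.zeros(cat_len)
    step = 0
    for i, m in enumerate(mappings):
        cat_vec[step + m[record[i]]] = 1
        step += len(m)
    return cat_vec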
Example #8
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)
Example #9
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)
Example #10
def main():

    spark = SparkSession.builder.appName("TRAFFIC").config(
        "spark.executor.cores", "6").config("spark.executor.memory",
                                            "6g").getOrCreate()
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    raw_data = sc.textFile("s3a://insighttraffic/dot_traffic_2015.txt")
    header = raw_data.first()
    records = raw_data.filter(lambda line: line != header).map(
        lambda x: x.split(","))
    records.cache()

    mappings = [get_mapping(records, i) for i in range(1, 11)]
    category_len = sum(map(len, mappings))
    boto3.resource('s3').Object('insighttraffic',
                                'ML_model/mappings').put(Body=str(mappings))

    for hour in range(0, 24):
        data_log = records.map(lambda r: LabeledPoint(
            extract_label(r, hour + 13),
            extract_features(r, hour, category_len, mappings)))  # log-transformed data
        linear_model_log = LinearRegressionWithSGD.train(data_log,
                                                         iterations=100,
                                                         step=0.01,
                                                         intercept=True)
        linear_model_log.save(
            sc, "s3a://insighttraffic/ML_model/linear_model_log_" + str(hour))

    sc.stop()
Example #11
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
                RidgeRegressionWithSGD
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)
 def get_model(self, dataf, num_iter, step_size, mini_batch_fraction):
     model = LinearRegressionWithSGD.train(
         dataf,
         iterations=num_iter,
         step=step_size,
         miniBatchFraction=mini_batch_fraction)
     return model
Example #13
def get_best_stepsize(step_sizes, training_lp, iterations, cv_trails):
    best_stepsize = 0
    lowest_RMSE = float("inf")
    num_folds = 4
    fold_set = [1] * num_folds
    cv_data = training_lp.randomSplit(fold_set)  # 4 folds
    for step_size in step_sizes:
        total_RMSE = 0.0
        for i in range(num_folds):
            cv_testing = cv_data[i]
            cv_training = training_lp.subtract(cv_testing)
            model = LinearRegressionWithSGD.train(cv_training,
                                                  iterations=iterations,
                                                  step=step_size)
            values_and_preds = cv_testing.map(
                lambda p: (p.label, model.predict(p.features)))
            MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
                operator.add)
            RMSE = math.sqrt(MSE)
            total_RMSE += RMSE
        avg_RMSE = total_RMSE / cv_trails
        if avg_RMSE < lowest_RMSE:
            lowest_RMSE = avg_RMSE
            best_stepsize = step_size

    return best_stepsize
Example #14
def train_model(data, rdd):
    """
    Train models with both scikit-learn and Spark MLlib
    """
    sklearn_model = sklearnLR()
    sklearn_model.fit(data[:, 1:], data[:, 0])
    mllib_model = LinearRegressionWithSGD.train(rdd, intercept=True)
    return sklearn_model, mllib_model
Example #15
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    model = LinearRegressionWithSGD.train(train,
                                          iterations=iterations,
                                          step=float(step),
                                          regParam=regParam,
                                          regType=regType,
                                          intercept=intercept)
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    rmsle = np.sqrt(tp.map(lambda t_p: squared_log_error(t_p[0], t_p[1])).mean())
    return rmsle
def get_best_result(best_step_size, training_lp, testing_lp, iterations):
    model = LinearRegressionWithSGD.train(training_lp, iterations=iterations, step=best_step_size, regType = 'l2')
    values_and_preds = testing_lp.map(lambda p: (p.label, model.predict(p.features)))
    MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(operator.add)
    RMSE = math.sqrt(MSE)

    result_str = 'best step size found by cross validation: ' + str(best_step_size) + ', lowest RMSE: ' + str(RMSE)
    return result_str
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    model = LinearRegressionWithSGD.train(train, iterations, step, regParam=regParam, regType=regType, intercept=intercept)
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    rmse = np.sqrt(tp.map(lambda t_p: squarred_error(t_p[0], t_p[1])).mean())
    mae = tp.map(lambda t_p: abs_error(t_p[0], t_p[1])).mean()  # no sqrt for MAE
    rmsle = np.sqrt(tp.map(lambda t_p: squared_log_error(t_p[0], t_p[1])).mean())
    opt_metrics = [rmse, mae, rmsle]
    return opt_metrics
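
squarred_error, abs_error and squared_log_error are assumed helpers in several snippets on this page; a minimal sketch:

import numpy as np

def squarred_error(actual, pred):
    # squared error for a single (actual, predicted) pair
    return (actual - pred)**2

def abs_error(actual, pred):
    return np.abs(actual - pred)

def squared_log_error(actual, pred):
    # log1p guards against zero values
    return (np.log1p(actual) - np.log1p(pred))**2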
def Regression_Model(filename):
    open_price, close_price, open_price_train, close_price_train, True_price, True_price_train, Date = get_csv_data(
        filename)
    output = []
    for i in range(1, len(Date)):
        tmp = LabeledPoint(label=True_price_train[i],
                           features=[close_price_train[i]])
        output.append(tmp)

    output_train_RDD = sc.parallelize(output).cache()
    lrm = LinearRegressionWithSGD.train(output_train_RDD,
                                        step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance',
                                       maxDepth=5,
                                       maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=5,
                                         maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)

    print("\n============MODEL Evaluation=============\n")
    model_name = [
        'LinearRegression', 'DecisionTree', 'RandomForest',
        'GradientBoostedTrees'
    ]
    es_modelname = ['lrm', 'tree', 'forest', 'gradient']
    result = ''
    x = 0
    err = 1000
    test_model = 'LinearRegression'
    # swap in a different model here
    output_model_RDD = lrm
    for model in [lrm, tree, forest, gradient]:
        predictions = model.predict(output_train_RDD.map(lambda x: x.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(
            predictions)
        RMSE = (
            labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /
            float(output_train_RDD.count()))**0.5
        #print("Predictions: ", labelsAndPredictions.take(10))
        result += model_name[x] + "\tRoot Mean Squared Error\t=" + str(RMSE) + "\n"
        if (err > RMSE):
            err = RMSE
            output_model = model
            es_model = es_modelname[x]
        x += 1
    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
Example #19
def getRMSE(step_array):
	valRMSE_list = []
	for step in step_array:
		model = LinearRegressionWithSGD.train(train_featureScoreTimeRDD, iterations=5000, step=step)
		labelsAndPreds = val_featureScoreTimeRDD.map(lambda p: (p.label, model.predict(p.features)))
		valMSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / val_featureScoreTimeRDD.count()
		valRMSE = valMSE**0.5
		valRMSE_list.append((step, valRMSE))
	return valRMSE_list
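
A hypothetical invocation, assuming the train/validation RDDs referenced inside getRMSE already exist:

for step, rmse in getRMSE([0.01, 0.1, 1.0, 10.0]):
    print('step = %s -> validation RMSE = %s' % (step, rmse))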
 def get_model_weight(self, dataf, weights, num_iter, step_size,
                      mini_batch_fraction):
     model = LinearRegressionWithSGD.train(
         dataf,
         iterations=num_iter,
         step=step_size,
         miniBatchFraction=mini_batch_fraction,
         initialWeights=weights)
     return model
 def evaluate(train_set, iterations, step, reg_param, reg_type, intercept):
     # create a linear model using stochastic gradient descent
     model = LinearRegressionWithSGD.train(train_set, iterations, step, regParam=reg_param, regType=reg_type,
                                           intercept=intercept)
     # evaluate on the training data -> rdd: [(actual_value, predicted_value), ...]
     tlabel_tprediction = train_set.map(lambda point: (point.label, model.predict(point.features)))
     # calculate Root Mean Squared Log Error
     rmsle = np.sqrt(tlabel_tprediction.map(lambda tp: squared_log_error(tp[0], tp[1])).mean())
     return rmsle
Example #22
def rmse_mae_gd(trainset, testset):
	#Stochastic gradient descent with l1
	model_sgd_l1 = LinearRegressionWithSGD.train(trainset, miniBatchFraction=0.00001, regParam=0.1, regType='l1', iterations=50, step=0.00000001)
	predicted = testset.map(lambda p: (p.label, model_sgd_l1.predict(p.features)))
	RMSE_l1 = sqrt(predicted.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / predicted.count())
	MAE_l1 = predicted.map(lambda vp: abs(vp[0] - vp[1])).reduce(lambda x, y: x + y) / predicted.count()
	mean_RMSE.append(RMSE_l1)
	print("Root Mean Squared Error for Stochastic Gradient Descent with l1: " + str(RMSE_l1))
	print("Mean Absolute Error for Stochastic Gradient Descent with l1: " + str(MAE_l1))
def linearRegression(features,sc,output_n):
	features_and_label = features.collect()
	training_features_labels = features_and_label[0:70]

	testing_features_labels = features_and_label[70:116]

	# build LabeledPoint RDDs from the (label, features) pairs
	training_data = sc.parallelize(
		[LabeledPoint(x[0], x[1]) for x in training_features_labels])
	testing_data = sc.parallelize(
		[LabeledPoint(y[0], y[1]) for y in testing_features_labels])

	linearregression_model = LinearRegressionWithSGD.train(training_data,iterations=0,regParam=200)
	prediction = testing_data.map(lambda line: (line.label, linearregression_model.predict(line.features)))
	return linearregression_model,prediction
def rr_fit(parsed_Data):
    rdd = parsed_Data.randomSplit([0.8, 0.2])
    model = LinearRegressionWithSGD.train(rdd[0], iterations=100,
                                          step=0.00000001, regType="l2")

    # Evaluate the model on training data
    valuesAndPreds = rdd[1].map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2)\
              .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
Example #25
def train_regression(data):
    model = LinearRegressionWithSGD.train(data,
                                          iterations=100,
                                          step=0.00000001)
    valuesAndPreds = data.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds \
              .map(lambda vp: (vp[0] - vp[1]) ** 2) \
              .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    return model
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):

    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)

# not RDD data 

    ndata = data.map(lambda line: line.split(character)).map(lambda part: [float(x) for x in part])

    if label == 0:
        ndata = ndata.map(lambda line: line[::-1])

    if normalize == 1:
        test_data = norm(ndata.collect())    
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))   
     #raw_data = data.map(lambda line: line.split(character))


    else:
        test_data = ndata.map(lambda part: (part[len(part) - 1], part[0:len(part) - 1])).collect()
        train_data = ndata.map(lambda part: lbp(part[len(part) - 1], part[0: len(part) - 1]))
    
    
    if ispca == 1:
        pca = PCA(n_components = pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)

        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])

        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
            

    model_lr = lr.train(train_data)
    err_lr = 0.0
    size = train_data.count()
   
    for i in range(size):
        err_lr = err_lr + abs(model_lr.predict(test_data[i][1]) - test_data[i][0])
           

    print "result:", err_lr/size

    String = "Linear Regression Result:\n"
    String = String + str(model_lr.weights) + '\n'
    String = String + "Error: " + str(err_lr / size) 
    
    sc.stop()

    return String
Example #27
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=100,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #28
def trainModel(data, rdd):
    """
    Train models with both scikit-learn and Spark MLlib
    """
    sklearnModel = sklearnLR()
    sklearnModel.fit(data[:, 1:], data[:, 0])
    # tune the hyperparameters
    mllibModel = LinearRegressionWithSGD.train(
        rdd, intercept=True, iterations=1000, miniBatchFraction=0.1,
        step=5, convergenceTol=1e-7)
    return sklearnModel, mllibModel
Example #29
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    model = LinearRegressionWithSGD.train(train,
                                          iterations,
                                          step,
                                          regParam=regParam,
                                          regType=regType,
                                          intercept=intercept)
    _tp = test.map(lambda p: (p.label, model.predict(p.features)))
    _rmsle = np.sqrt(
        _tp.map(lambda tp: squared_log_error(tp[0], tp[1])).mean())
    return _rmsle
def iterateLRwSGDBatch(iterNums, stepSizes, fractions, train, valid):
  for numIter in iterNums:
    for step in stepSizes:
      for miniBFraction in fractions:
        alg = LinearRegressionWithSGD()
        model = alg.train(train, intercept=True, iterations=numIter, step=step, miniBatchFraction=miniBFraction)
        rescaledPredicts = train.map(lambda x: (model.predict(x.features), x.label))
        validPredicts = valid.map(lambda x: (model.predict(x.features), x.label))
        meanSquared = math.sqrt(rescaledPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
        meanSquaredValid = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
        print("%d, %5.3f %5.3f -> %.4f, %.4f" % (numIter, step, miniBFraction, meanSquared, meanSquaredValid))
Example #31
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    lrModel = LinearRegressionWithSGD.train(train,
                                            iterations,
                                            step,
                                            regParam=regParam,
                                            regType=regType,
                                            intercept=intercept)
    # weights of lr model
    # lrModel.weights
    actual_vs_pred = test.map(lambda p: (p.label, lrModel.predict(p.features)))
    #print actual_vs_pred.take(10)
    actual_pred_error(actual_vs_pred)
def linearRegression(features, sc, output_n):
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]

    testing_features_labels = features_and_label[70:116]

    # build LabeledPoint RDDs from the (label, features) pairs
    training_data = sc.parallelize(
        [LabeledPoint(x[0], x[1]) for x in training_features_labels])
    testing_data = sc.parallelize(
        [LabeledPoint(y[0], y[1]) for y in testing_features_labels])

    linearregression_model = LinearRegressionWithSGD.train(training_data,
                                                           iterations=0,
                                                           regParam=200)
    prediction = testing_data.map(lambda line: (
        line.label, linearregression_model.predict(line.features)))
    return linearregression_model, prediction
def iterateLRwSGD(iterNums, stepSizes, train, valid):
  from pyspark.mllib.regression import LinearRegressionWithSGD
  import math
  for numIter in iterNums:
    for step in stepSizes:
      alg = LinearRegressionWithSGD()
      model = alg.train(train, iterations=numIter, step=step, intercept=True)
      rescaledPredicts = train.map(lambda x: (float(model.predict(x.features)), x.label))
      validPredicts = valid.map(lambda x: (float(model.predict(x.features)), x.label))
      meanSquared = math.sqrt(rescaledPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
      meanSquaredValid = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
      print("%d, %5.3f -> %.4f, %.4f" % (numIter, step, meanSquared, meanSquaredValid))
Example #34
def get_best_result(best_step_size, training_lp, testing_lp, iterations):
    model = LinearRegressionWithSGD.train(training_lp,
                                          iterations=iterations,
                                          step=best_step_size)
    values_and_preds = testing_lp.map(lambda p:
                                      (p.label, model.predict(p.features)))
    MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(operator.add)
    RMSE = math.sqrt(MSE)

    result_str = 'best step size found by cross validation: ' + str(
        best_step_size) + ', lowest RMSE: ' + str(RMSE)
    return result_str
Example #35
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #36
def rmse_mae_gd(trainset, testset):
    model_gd = LinearRegressionWithSGD.train(trainset,
                                             iterations=50,
                                             step=0.00000001)
    predicted = testset.map(lambda p: (p.label, model_gd.predict(p.features)))
    RMSE = sqrt(
        predicted.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y)
        / predicted.count())
    MAE = predicted.map(lambda vp: abs(vp[0] - vp[1])).reduce(
        lambda x, y: x + y) / predicted.count()
    mean_RMSE.append(RMSE)
    print("Root Mean Squared Error for Gradient Descent: " + str(RMSE))
    print("Mean Absolute Error for Gradient Descent: " + str(MAE))
def main():
    records = get_records()
    mappings = [get_mapping(records, i) for i in range(2,10)]

    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    data_log = data.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    model_log = LinearRegressionWithSGD.train(data_log, iterations=10, step=0.1)
    true_vs_predicted_log = data_log.map(lambda p: (np.exp(p.label), np.exp(model_log.predict(p.features))))
    calculate_print_metrics("Linear Regression Log", true_vs_predicted_log)
def xRMSerror(parsedDataTrain, parsedDataTest):

    numIterations = 1000
    stepsize = 0

    model = LinearRegressionWithSGD.train(parsedDataTrain, numIterations, stepsize)


    # Evaluate the model on the test data
    valuesAndPreds = parsedDataTest.map(lambda p: (p.label, model.predict(p.features)))
    print(valuesAndPreds.take(5))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return math.sqrt(MSE)
def get_best_stepsize(step_sizes, training_lp, testing_lp, iterations):
    best_stepsize = 0
    lowest_RMSE = float("inf")
    for step_size in step_sizes:
        model = LinearRegressionWithSGD.train(training_lp, iterations=iterations, step=step_size)
        values_and_preds = testing_lp.map(lambda p: (p.label, model.predict(p.features)))
        MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(operator.add)
        RMSE = math.sqrt(MSE)
        if RMSE < lowest_RMSE:
            lowest_RMSE = RMSE
            best_stepsize = step_size

    result_str = 'best step size: ' + str(best_stepsize) + ', lowest RMSE: ' + str(lowest_RMSE)
    return result_str
def lr_example():
    min_freq = 1
    n_common = 10

    pwd = os.path.dirname(os.path.abspath(__file__))
    path = pwd + '/example_data/twitter_2020-03-10_slim.csv'
    print(path)
    df = csv_parser.load_as_df(path, twitter_schema)
    df.show(3)

    converted = featurizer.convert_df_to_feature(
        df, n_common, min_freq).filter(
            lambda row: row['age'] is not None and row['feature'] is not None)
    converted = converted.map(
        # (age, sex, feature)
        lambda row: LabeledPoint(row['age'], concat_vectors(row['feature'])))
    converted = converted.zipWithIndex()

    sample = converted.take(3)

    train_rdd = converted.filter(lambda x: x[1] % 2 == 0).map(lambda x: x[0])

    feature_dim = len(train_rdd.first().features)

    test_rdd = converted.filter(lambda x: x[1] % 2 == 1).map(lambda x: x[
        0]).filter(lambda x: len(x.features) == feature_dim).collect()

    print("confirming dim of train rdd")
    sample = train_rdd.take(3)
    for e in sample:
        print(e.features)
        print(len(e.features))

    lrm = LinearRegressionWithSGD.train(train_rdd)
    n = len(test_rdd)

    mse = 0
    # test on the held-out half
    for lp in test_rdd:
        gt = lp.label
        feat = lp.features
        pred = lrm.predict(feat)
        print(gt, pred)
        mse += (pred - gt) * (pred - gt)

    import math
    rmse = math.sqrt(mse / n)

    print('Root mean square error: ' + str(rmse))
def evaluate_lm(train_set,
                test_set,
                step,
                batch_pct,
                reg,
                reg_param,
                iterations=100):
    # Train the model, then evaluate it on the test data
    lm = LinearRegressionWithSGD.train(train_set, iterations=iterations, \
                                       step=step, miniBatchFraction=batch_pct,\
                                       regType=reg, regParam=reg_param,\
                                       intercept=True, validateData=False )

    values_and_preds = test_set.map(lambda x:
                                    (x.label, float(lm.predict(x.features))))
    return get_lr_evals(values_and_preds)
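
get_lr_evals is not defined in this snippet; a minimal sketch using RegressionMetrics (assumed to return RMSE and R-squared):

from pyspark.mllib.evaluation import RegressionMetrics

def get_lr_evals(values_and_preds):
    # hypothetical helper; RegressionMetrics expects (prediction, observation) pairs
    metrics = RegressionMetrics(values_and_preds.map(lambda vp: (vp[1], vp[0])))
    return metrics.rootMeanSquaredError, metrics.r2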
def LinearRegression(filename, sc):
	filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
	data = sc.textFile(filename)
	parsedData = data.map(parsePoint)

	# train the model
	model = LinearRegressionWithSGD.train(parsedData)

	# Evaluate the model on training data
	valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
	print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

	# Save and load model
	#model.save(sc, "myModelPath")
	#sameModel = LinearRegressionModel.load(sc, "myModelPath")
Example #44
def test_spark():
    def parsePoint(line):
        values = [float(x) for x in line.replace(',', ' ').split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile(r"/usr/local/Cellar/apache-spark/1.6.1/libexec/data/mllib/ridge-data/lpsa.data")
    parsedData = data.map(parsePoint)
    print(parsedData.collect())

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    print("Model coefficients:", str(model))
Example #45
def algo(a):
    global data
    global week 
    global target
    test = week 
    week_target = week.map(convert)
    #apply(convert, axis=1)
    #np.random.seed(123)
    data_final = LabeledPoint(target, data)
    #make an rdd that is the input for the algo


    if a == 'sgd':
        time_0 = time.time()
        lrm = LinearRegressionWithSGD.train(sc.parallelize([data_final]), iterations=10, initialWeights=np.array([1.0]))
        print(abs(lrm.predict(test)))
        print(time.time() - time_0)
def linearRegression(features,sc,output_n):
	features_and_label = features.collect()
	training_features_labels = features_and_label[0:70]
	
	testing_features_labels = features_and_label[70:]

	labeled_training = []
	labeled_testing = []
	for x in training_features_labels:
		labeled_training.append(LabeledPoint(x[0],x[1]))

	for y in testing_features_labels:
		labeled_testing.append(LabeledPoint(y[0],y[1]))

	train = sc.parallelize(labeled_training)
	test = sc.parallelize(labeled_testing)

	linearregression_model = LinearRegressionWithSGD.train(train,iterations=0,regParam=200)
	predictions = test.map(lambda line: (line.label, float(linearregression_model.predict(line.features))))
	return predictions
def linearRegression_f(mode):
    if   mode == "no_reg":
         model = LinearRegressionWithSGD.train(parsedData)
    elif mode == "L1_reg":
         model = LassoWithSGD.train(parsedData)
    elif mode == "L2_reg":
         model = RidgeRegressionWithSGD.train(parsedData)
    else:
        print("ERROR Mode")
        return None

    #Evaluate the model on training data
    # parsedData map method to get {train_data, predict_data} pairs 
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    
    #calculate the key-value pairs to get MSE
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    
  
    return MSE
def LinearRegression(trainFile, testFile, taskid,sc):
	# filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
	# data = sc.textFile(filename)
	# parsedData = data.map(parsePoint)

	trainData = MLUtils.loadLibSVMFile(sc, trainFile)
	testData = MLUtils.loadLibSVMFile(sc, testFile)

	# train the model
	model = LinearRegressionWithSGD.train(trainData)

	# Evaluate the model on training data
	# predictionAndLabels = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	predictionAndLabels = testData.map(lambda p: (p.label, model.predict(p.features)))
	MSE = predictionAndLabels.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / predictionAndLabels.count()
	print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

	# Save and load model
	#model.save(sc, "myModelPath")
	#sameModel = LinearRegressionModel.load(sc, "myModelPath")
def get_best_stepsize(step_sizes, training_lp, iterations, cv_trails):
    best_stepsize = 0
    lowest_RMSE = float("inf")
    num_folds = 4
    fold_set = [1]*num_folds
    cv_data = training_lp.randomSplit(fold_set) # 4 folds
    for step_size in step_sizes:
        total_RMSE = 0.0
        for i in range(num_folds):
            cv_testing = cv_data[i]
            cv_training = training_lp.subtract(cv_testing)
            model = LinearRegressionWithSGD.train(cv_training, iterations=iterations, step=step_size)
            values_and_preds = cv_testing.map(lambda p: (p.label, model.predict(p.features)))
            MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(operator.add)
            RMSE = math.sqrt(MSE)
            total_RMSE += RMSE
        avg_RMSE = total_RMSE/cv_trails
        if avg_RMSE < lowest_RMSE:
            lowest_RMSE = avg_RMSE
            best_stepsize = step_size

    return best_stepsize
    def train_amount_model(self, model, data, i):
        rdd_data = self.sc.parallelize(data)
        self.logger.info('Start to train the amount model')
        if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK:
            input_num = self.feature_num
            layers = [input_num, input_num // 3 * 2, input_num // 3, 1]  # integer layer sizes

            neural_network = NeuralNetworkSpark(layers=layers, bias=0)
            model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234, learn_rate=0.0001,
                                         iteration=15, model=model)
        elif self.amount_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={}, numTrees=40,
                                                featureSubsetStrategy="auto", impurity='variance', maxDepth=20,
                                                maxBins=32)

        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                  initialWeights=model.weights if model is not None else None)

        else:
            self.logger.error("Unknown training method {}".format(self.amount_prediction_method))
            raise ValueError("Unknown training method {}".format(self.amount_prediction_method))
        return model
Example #51
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
def learn_model(sc, file_path, normalize):
	feature_file = sc.textFile(file_path).map(lambda l:l.split("\t"))

	points = feature_file.map(lambda f: LabeledPoint(f[1], f[2:]))
	
	#normalizing
	if normalize:
		nor      = Normalizer()
		labels   = points.map(lambda x: x.label)
		features = points.map(lambda x: x.features)
		points = labels.zip(nor.transform(features))
		points = points.map(lambda i: LabeledPoint(i[0], i[1]))

	training, testing = points.randomSplit([0.7,0.3],11)
	index = 0
	iterations = 100
	p_mse = -1
	converge = False
	result = {}
	while(not converge):
		x = time.clock()
		model = LinearRegressionWithSGD.train(training, iterations=iterations, step=0.00001,intercept=True,regType="l1")
		y = time.clock()
		print("========== time = " + str(y - x))
		preds = testing.map(lambda p: (p.label, model.predict(p.features)))
		MSE = preds.map(lambda r: (r[1] - r[0])**2).reduce(lambda x, y: x + y) / preds.count()
		print("========== MSE = " + str(MSE))
		if p_mse == MSE:
			converge = True

		iterations = iterations +100
		result[iterations] = MSE
		p_mse = MSE
	
	print(result)
	return model
# (reconstructed) header of the parsePoint helper used below,
# assuming comma-separated numeric fields
def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[7], values[0:11])

#data_file = sc.textFile("/home/faiz89/Desktop/Eastman/2008.csv")
data_file = sc.textFile("../2008_small.csv")
header = data_file.first ()
raw_data = data_file.filter (lambda x:x != header)

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache ()
model = LinearRegressionWithSGD.train(trainingData, iterations=1)
print('Training Time consumed =', datetime.now() - startTime)
startTestTime = datetime.now()
testData.cache()
# Evaluating the model on test data
valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds \
    .map(lambda vp: (vp[0] - vp[1])**2) \
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
print('Testing Time consumed =', datetime.now() - startTestTime)
print('Total Time:', datetime.now() - startTime)

print("Mean Squared Error = " + str(MSE))
# Save and load model
model.save(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
sameModel = LinearRegressionModel.load(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept


# In[62]:

# TODO: Replace <FILL IN> with appropriate code
firstModel = LinearRegressionWithSGD.train(parsedTrainData, 
                                           iterations=numIters, 
                                           step=alpha, 
                                           miniBatchFraction=miniBatchFrac, 
                                           initialWeights=None, 
                                           regParam=reg, 
                                           regType=regType, 
                                           intercept=useIntercept 
                                           )

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
print(weightsLR1, interceptLR1)


# In[63]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
Example #55
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print(features_transform.take(5))

lab = df.map(lambda row: row[0])

transformedData = lab.zip(features_transform)

transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]]))

trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
print(linearModel.weights)

print(testingData.take(10))

print(linearModel.predict([5.20814108601,42.4568179338,0.443700296128,6.20889144381,58.6223297308])) #actual 54.022

#score the model of the training data
prediObserRddIn = trainingData.map(lambda row: (float(linearModel.predict(row.features[0])), row.label))
metrics = RegressionMetrics(prediObserRddIn)
print(metrics.r2)
print(metrics.rootMeanSquaredError)

#predict on the testing data
prediObserRddOut = testingData.map(lambda row: (float(linearModel.predict(row.features[0])), row.label))
metricsOut = RegressionMetrics(prediObserRddOut)
#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
validPredicts = validScaled.map(lambda x: (float(model.predict(x.features)), x.label))
validPredicts.collect()
import math
RMSE = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())

#Section 7.5.2
from pyspark.mllib.evaluation import RegressionMetrics
validMetrics = RegressionMetrics(validPredicts)
validMetrics.rootMeanSquaredError
validMetrics.meanSquaredError

#Section 7.5.3
import operator
# In[77]:

from pyspark.mllib.regression import LinearRegressionWithSGD
# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept


# In[79]:

# TODO: Replace <FILL IN> with appropriate code
firstModel = LinearRegressionWithSGD.train(parsedTrainData, numIters, alpha, miniBatchFrac, None, reg, regType, useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
print(weightsLR1, interceptLR1)


# In[80]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
expectedWeights = [16.682292427, 14.7439059559, -0.0935105608897, 6.22080088829, 4.01454261926, -3.30214858535,
                   11.0403027232, 2.67190962854, 7.18925791279, 4.46093254586, 8.14950409475, 2.75135810882]
Test.assertTrue(np.allclose(interceptLR1, expectedIntercept), 'incorrect value for interceptLR1')
Test.assertTrue(np.allclose(weightsLR1, expectedWeights), 'incorrect value for weightsLR1')
if __name__ == "__main__":
    sc = SparkContext(appName="Regression Metrics Example")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = line.split()
        return LabeledPoint(float(values[0]),
                            DenseVector([float(x.split(':')[1]) for x in values[1:]]))

    data = sc.textFile("data/mllib/sample_linear_regression_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Get predictions
    valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label))

    # Instantiate metrics object
    metrics = RegressionMetrics(valuesAndPreds)

    # Squared Error
    print("MSE = %s" % metrics.meanSquaredError)
    print("RMSE = %s" % metrics.rootMeanSquaredError)

    # R-squared
    print("R-squared = %s" % metrics.r2)

    # Mean absolute error
    print("MAE = %s" % metrics.meanAbsoluteError)
Example #59
print(parsedData.take(3))


# In[58]:

#Devide rawData into Traning, Validation and Test
weights = [.8, .1, .1]
seed = 50
parsedTrainData, parsedValData, parsedTestData = parsedData.randomSplit(weights, seed)


# In[64]:

# Fit the model with default values
fitModel = LinearRegressionWithSGD.train(parsedTrainData)
print(fitModel)


# In[65]:

# Prediction 
testPoint = parsedTrainData.take(1)[0]

print(testPoint.label)

testPrediction = fitModel.predict(testPoint.features)

print(testPrediction)