Example #1
def cross_validation_gb(Data_1, Data_2, Data_3, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train_1 = GradientBoostedTrees.trainRegressor(Data_1.union(Data_2), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    testMSE_1 = labelsAndPredictions_1.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_3.count())

    model_train_2 = GradientBoostedTrees.trainRegressor(Data_2.union(Data_3), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_1.count())

    model_train_3 = GradientBoostedTrees.trainRegressor(Data_3.union(Data_1), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_2.count())

    return (testMSE_1+testMSE_2+testMSE_3)/3
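The three folds above all follow one pattern: train on the other two folds, score on the held-out one. A k-fold sketch of the same idea (illustrative, not part of the original; assumes folds is a list of LabeledPoint RDDs and the same loss_type/num_iter/maxDepth parameters):

def cross_validation_gb_k(folds, loss_type, num_iter, maxDepth):
    # Average held-out MSE over k folds (illustrative generalization of the
    # 3-fold routine above)
    errors = []
    for i, test_fold in enumerate(folds):
        train_rdd = None
        for j, fold in enumerate(folds):
            if j != i:
                train_rdd = fold if train_rdd is None else train_rdd.union(fold)
        model = GradientBoostedTrees.trainRegressor(
            train_rdd, categoricalFeaturesInfo={},
            loss=loss_type, numIterations=num_iter, maxDepth=maxDepth)
        preds = model.predict(test_fold.map(lambda x: x.features))
        pairs = test_fold.map(lambda lp: lp.label).zip(preds)
        errors.append(pairs.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /
                      float(test_fold.count()))
    return sum(errors) / len(errors)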
Example #2
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(sc.parallelize(data_train.collect(),5), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,outputCol="features")

    transformed_final= assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda row: row.review_id).zip(predictions)
    return labelsAndPredictions
Example #3
def main(sc, sql_context, is_hive=True):
    lp_train = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.train")
    lp_check = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.check")

    model = GradientBoostedTrees.trainRegressor(lp_train, {},
                                                numIterations=50,
                                                maxDepth=10)

    preds = model.predict(lp_check.map(lambda x: x.features))
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)

    for each in labels_and_preds.take(100):
        print(each)

    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print(each)

    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print(mse)
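    # Baseline for comparison: MSE of always predicting 1.0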
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print(mse)
Example #4
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Example #5
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        except ValueError:
            self.fail()
Example #6
def Regression_Model(filename):
    open_price, close_price, open_price_train, close_price_train, True_price, True_price_train, Date = get_csv_data(
        filename)
    output = []
    for i in range(1, len(Date)):
        tmp = LabeledPoint(label=True_price_train[i],
                           features=[close_price_train[i]])
        output.append(tmp)

    output_train_RDD = sc.parallelize(output).cache()
    lrm = LinearRegressionWithSGD.train(output_train_RDD,
                                        step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance',
                                       maxDepth=5,
                                       maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=5,
                                         maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)

    print("\n============MODEL Evaluation=============\n")
    model_name = [
        'LinearRegression', 'DecisionTree', 'RandomForest',
        'GradientBoostedTrees'
    ]
    es_modelname = ['lrm', 'tree', 'forest', 'gradient']
    result = ''
    x = 0
    err = 1000
    test_model = 'LinearRegression'
    # Swap in a different model here
    output_model_RDD = lrm
    for model in [lrm, tree, forest, gradient]:
        predictions = model.predict(output_train_RDD.map(lambda x: x.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(
            predictions)
        # The **0.5 makes this the root mean squared error, so label it as such
        RMSE = (
            labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /
            float(output_train_RDD.count())) ** 0.5
        #print ("Predictions: ", valuesAndPreds.take(10))
        result += model_name[x] + "\tRoot Mean Squared Error\t=" + str(RMSE) + "\n"
        if (err > RMSE):
            err = RMSE
            output_model = model
            es_model = es_modelname[x]
        x += 1
    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
Example #7
def cross_validation_gb(Data_1, Data_2, Data_3, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train_1 = GradientBoostedTrees.trainRegressor(
        Data_1.union(Data_2),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    testMSE_1 = labelsAndPredictions_1.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_3.count())

    model_train_2 = GradientBoostedTrees.trainRegressor(
        Data_2.union(Data_3),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_1.count())

    model_train_3 = GradientBoostedTrees.trainRegressor(
        Data_3.union(Data_1),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_2.count())

    return (testMSE_1 + testMSE_2 + testMSE_3) / 3
Example #8
def testRegression(trainingData, testData, model_path):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, numIterations=3, maxDepth=4)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
    print("Test Mean Squared Error = " + str(testMSE))
    print("Learned regression GBT model:")
    print(model.toDebugString())
    model.save(sc, model_path)
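Since the function persists the model, the matching reload step looks like this (a sketch; assumes the same sc and model_path, and the GradientBoostedTreesModel class already imported in other examples on this page):

from pyspark.mllib.tree import GradientBoostedTreesModel

# Reload the persisted regressor later (sketch; same sc and model_path as above)
same_model = GradientBoostedTreesModel.load(sc, model_path)
print(same_model.numTrees())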
Example #9
def validation_gb(trainingData, testData, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(testData.count())
    return testMSE
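Because validation_gb returns the held-out MSE, it can drive a simple grid search (an illustrative sketch; the parameter grids and the 'leastSquaresError' loss are assumptions, and trainingData/testData are assumed LabeledPoint RDDs):

# Illustrative hyperparameter sweep built on validation_gb above
best = None
for num_iter in [10, 30, 50]:  # assumed grid
    for depth in [3, 5, 7]:  # assumed grid
        mse = validation_gb(trainingData, testData, 'leastSquaresError', num_iter, depth)
        if best is None or mse < best[0]:
            best = (mse, num_iter, depth)
print('best (MSE, numIterations, maxDepth) = ' + str(best))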
Example #10
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #11
def testRegression(trainingData, testData):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                numIterations=30, maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() \
        / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression ensemble model:')
    print(model.toDebugString())
Example #12
def main():
    #Reading train and test data
    trainData = sc.pickleFile(input + '/Train_data.average/part-00000')
    testData = sc.pickleFile(input + '/Test_data.average/part-00000')
    # LabeledPoint labels are floats, so only the feature vector needs a length check
    parsedData = trainData.map(parseInput).filter(lambda line: len(line.features) != 0)
    parsedTestData = testData.map(parseInput).filter(lambda line: len(line.features) != 0).cache()
    model = GradientBoostedTrees.trainRegressor(parsedData,categoricalFeaturesInfo={}, numIterations=1)
    predictions = model.predict(parsedTestData.map(lambda x: x.features))
    labelsAndPredictions = parsedTestData.map(lambda lp: lp.label).zip(predictions)
    validationErr = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(parsedTestData.count())
    parsedTestData.unpersist()
    RMSE=math.sqrt(validationErr)

    print("Root Mean Squared Error Test= " + str(RMSE))
Example #13
def validation_gb(trainingData, testData, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(
        trainingData,
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(testData.count())
    return testMSE
Example #14
def testRegression(trainingData, testData):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=30,
                                                maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() \
        / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression ensemble model:')
    print(model.toDebugString())
Example #15
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train = set(
        ['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [
        v for i, v in enumerate(train_data.columns)
        if v not in removelist_train
    ]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train,
                                      outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select(
        "features",
        "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(
        sc.parallelize(data_train.collect(), 5),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(
        ['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [
        v for i, v in enumerate(test_data.columns) if v not in removelist_final
    ]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,
                                      outputCol="features")

    transformed_final = assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(
        lambda row: row.review_id).zip(predictions)
    return labelsAndPredictions
Example #16
def main():
    records = get_records()
    first = records.first()
    records.cache()

    # extract all the catgorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    first_point = data.first()

    gbt_model = GradientBoostedTrees.trainRegressor(data,
                                                    categoricalFeaturesInfo={},
                                                    numIterations=3)
    true_vs_predicted_gbt = data.map(lambda p:
                                     (p.label, gbt_model.predict(p.features)))

    predictions = gbt_model.predict(data.map(lambda x: x.features))
    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    print "GradientBoosted Trees predictions: " + str(
        labelsAndPredictions.take(5))

    mse = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(data.count())
    mae = labelsAndPredictions.map(lambda vp: np.abs(vp[0] - vp[1])).sum() /\
        float(data.count())
    # Take the square root so the value matches its name (RMSLE)
    rmsle = np.sqrt(labelsAndPredictions.map(
        lambda vp: (np.log(vp[1] + 1) - np.log(vp[0] + 1)) ** 2).sum() /
        float(data.count()))
    print('Gradient Boosted Trees - Mean Squared Error = ' + str(mse))
    print('Gradient Boosted Trees - Mean Absolute Error = ' + str(mae))
    print('Gradient Boosted Trees - Root Mean Squared Log Error = ' +
          str(rmsle))
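The same error figures can be read off pyspark.mllib.evaluation.RegressionMetrics instead of hand-rolled map/sum passes (a sketch; note RegressionMetrics expects (prediction, observation) pairs, the reverse of labelsAndPredictions above):

from pyspark.mllib.evaluation import RegressionMetrics

# Sketch: equivalent metrics via RegressionMetrics, reusing predictions and
# data from the function above (pairs are (prediction, label))
metrics = RegressionMetrics(predictions.zip(data.map(lambda lp: lp.label)))
print('MSE = ' + str(metrics.meanSquaredError))
print('MAE = ' + str(metrics.meanAbsoluteError))
print('RMSE = ' + str(metrics.rootMeanSquaredError))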
Example #17
    def labelData(data):
        return data.map(lambda row: LabeledPoint(row[2], row[3:]))

    f = open('GradientBoostedTree_regression_evaluation.txt', 'w')

    training, test = labelData(data).randomSplit([0.8, 0.2])
    numTraining = training.count()
    numTest = test.count()

    def getPredictionsLabels(model, test):
        predictions = model.predict(test.map(lambda r: r.features))
        return predictions.zip(test.map(lambda r: r.label))

    def printMetrics(predictions_and_labels):
        metrics = RegressionMetrics(predictions_and_labels)
        f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
        f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
        f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
        f.write('Root Mean Squared Error:{0}\n'.format(
            metrics.rootMeanSquaredError))
        f.write('R^2 :{0}\n'.format(metrics.r2))

    model = GradientBoostedTrees.trainRegressor(training,
                                                categoricalFeaturesInfo={})
    f.write(model.toDebugString())
    predictions_and_labels = getPredictionsLabels(model, test)
    printMetrics(predictions_and_labels)

    f.close()
    sc.stop()
Example #18
def runmodel_spark(spark, train, test, modelname):
    newtrain = make_dataframe(chromosome, train)
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Train on the loaded data (the original referenced an undefined
    # `trainingData` here) and return the fitted model
    model = GradientBoostedTrees.trainRegressor(data,
                                                categoricalFeaturesInfo={},
                                                numIterations=30)
    return model
Example #19
print(x)
print(y)

os.environ["SPARK_HOME"] = "/Users/alexsisu/programs/spark-1.6.0"
conf = SparkConf().setAppName("myapp").setMaster("local")
sc = SparkContext(conf=conf)

input_data = []
for (xx, yy) in zip(x, y):
    lp = LabeledPoint(xx, [yy])
    input_data.append(lp)

training_data = sc.parallelize(input_data).cache()
test_data_rdd = sc.parallelize(input_data).cache()

classificationModel = GradientBoostedTrees.trainRegressor(
    training_data, categoricalFeaturesInfo={}, numIterations=100, maxDepth=10)
result = classificationModel.predict(test_data_rdd.map(lambda x: x.features))

print(classificationModel)
print(classificationModel.toDebugString())
print("===============================")
predicted_data = result.collect()
print(predicted_data)

zippedResult = test_data_rdd.map(lambda x: x.label).zip(result)

metrics = RegressionMetrics(zippedResult)

print(metrics.meanAbsoluteError)
print(metrics.meanSquaredError)
print(metrics.rootMeanSquaredError)
Example #20
    training, test = labelData(data).randomSplit([0.8, 0.2])
    numTraining = training.count()
    numTest = test.count()

    def getPredictionsLabels(model, test):
        predictions = model.predict(test.map(lambda r: r.features))
        return predictions.zip(test.map(lambda r: r.label))

    def printMetrics(predictions_and_labels):
        metrics = RegressionMetrics(predictions_and_labels)
        f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
        f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
        f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
        f.write('Root Mean Squared Error:{0}\n'.format(
            metrics.rootMeanSquaredError))
        f.write('R^2 :{0}\n'.format(metrics.r2))

    timestart = datetime.datetime.now()
    model = GradientBoostedTrees.trainRegressor(training, categoricalFeaturesInfo={},
                                                loss='leastSquaresError', numIterations=10,
                                                learningRate=0.1, maxDepth=15, maxBins=16)
    f.write(model.toDebugString())
    predictions_and_labels = getPredictionsLabels(model, test)
    printMetrics(predictions_and_labels)
    timeend = datetime.datetime.now()
    timedelta = round((timeend - timestart).total_seconds(), 2)
    f.write("Time taken to execute this model is: " + str(timedelta) +
            " seconds.\n")

    f.close()
    sc.stop()
if model_type == "classification":
    model = GradientBoostedTrees.trainClassifier(
        lp,
        categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors),
        loss=loss_param,
        numIterations=numIterations_param,
        learningRate=learningRate_param,
        maxDepth=maxDepth_param,
        maxBins=maxBins_param)
else:
    # regression
    model = GradientBoostedTrees.trainRegressor(
        lp,
        categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors),
        loss=loss_param,
        numIterations=numIterations_param,
        learningRate=learningRate_param,
        maxDepth=maxDepth_param,
        maxBins=maxBins_param)

build_report = mbr.report(lp.count(),lp.getNumPartitions(),
    predictors,datamodel,target,model_type,
    settings=[("Algorithm","Gradient Boosted Trees",[("loss",loss_param),("numIterations",numIterations_param),("learningRate",learningRate_param),("maxDepth",maxDepth_param),("maxBins",maxBins_param)])])

print(build_report)

model.save(sc, modelpath)

model_metadata = { "target":target, "predictors":predictors, "datamodel": datamodel, "model_type":model_type }

print(model.toDebugString())
Example #22
from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")
import sys, pickle,math
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)

input = sys.argv[1]

# Load and parse the data
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])

train = sc.pickleFile(input+'/bow_train/part-00000')
test = sc.pickleFile(input+'/bow_test/part-00000')
# LabeledPoint labels are floats, so only the feature vector needs a length check
parsedtrain = train.map(parsePoint).filter(lambda line: len(line.features) != 0)
parsedtest = test.map(parsePoint).filter(lambda line: len(line.features) != 0).cache()
model = GradientBoostedTrees.trainRegressor(parsedtrain,categoricalFeaturesInfo={}, numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
val_err = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE=math.sqrt(val_err)

print("Root Mean Squared Error Test= " + str(RMSE))

Example #23
all_data = np.array(list(zip(yy, xx)))
sss = ShuffleSplit(len(all_data) - 1, test_size=0.20, random_state=1234)

for train_indexes, test_indexes in sss:
    lparr = []
    test_lp_arr = []
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]

    for medianvalue, record in sample_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        lparr.append(lp)

    for medianvalue, record in test_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        test_lp_arr.append(lp)

    training_data = sc.parallelize(lparr).cache()
    test_data_rdd = sc.parallelize(test_lp_arr).cache()

    regression_model = GradientBoostedTrees.trainRegressor(training_data, categoricalFeaturesInfo={}, numIterations=10,maxDepth=10)
    result = regression_model.predict(test_data_rdd.map(lambda x: x.features))
    print(regression_model)
    print(regression_model.toDebugString())
    print("===============================")
    predicted_data = result.collect()
    actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()

    print(mean_absolute_error(actual_data, predicted_data))
    break
Example #24
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={}, numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(sc, "target/tmp/myGradientBoostingRegressionModel")
    # $example off$
Example #25
    def test_regression(self):
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD,
            LassoWithSGD,
            RidgeRegressionWithSGD,
        )
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2]),
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4
        )
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1
        )
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4
        )
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()

        # Verify that maxBins is being passed through
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32
        )
        with self.assertRaises(Exception):
            GradientBoostedTrees.trainRegressor(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1
            )
Example #26
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext, SparkConf

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

sparse_data = [
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
]

data = sc.parallelize(sparse_data)

model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10)
model.numTrees()

model.totalNumNodes()

model.predict(SparseVector(2, {1: 1.0}))

model.predict(SparseVector(2, {0: 1.0}))

rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
print(model.predict(rdd).collect())

model.save(sc, 'model')
Example #27
# We have to do something here to cache the dataset, otherwise it hangs later on due to a PySpark bug
num_records = training_data.count()

print("     * Transformed data read!")
print("     * Training test ML model... ")

# Label the data points
labeled_data = training_data.map(lambda x: LabeledPoint(x[-1], x[:-1]))
# Separate training and testing data
train_data, test_data = labeled_data.randomSplit([0.8, 0.2])
# Do something again to avoid the PySpark bug hang from manifesting
num_train_recs = train_data.count()
num_test_recs = test_data.count()
# Train the model
ml_model = GradientBoostedTrees.trainRegressor(train_data, {},
                                               numIterations=20,
                                               loss='leastAbsoluteError')

print("     * Model trained!")
print("     * Testing model error... ")

# Predict and calculate error metrics
predictions = ml_model.predict(test_data.map(lambda r: r.features))
predictions = predictions.zip(test_data.map(lambda r: r.label))
metrics = RegressionMetrics(predictions)

print("     * Model regression error metrics: ")
print("         - Mean Absolute Error: %.2f" % metrics.meanAbsoluteError)
print("         - Mean Squared Error: %.2f" % metrics.meanSquaredError)
print("         - Root Mean Squared Error: %.2f" %
      metrics.rootMeanSquaredError)
Example #28
dirfilename = modelDir + rfclassificationfilename

rfModel.save(sc, dirfilename)

# Convert to df
test_predictions = sqlContext.createDataFrame(predictionAndLabels)
test_predictions.registerTempTable("randomForest_results")
'''

## GRAD BOOSTED TREES ##

categoricalFeaturesInfo = {0: 2, 1: 2, 2: 6, 3: 4}
gbtModel = GradientBoostedTrees.trainRegressor(
    indexed_train_reg,
    categoricalFeaturesInfo=categoricalFeaturesInfo,
    numIterations=10,
    maxBins=32,
    maxDepth=4,
    learningRate=0.1)

predictions = gbtModel.predict(indexed_test_reg.map(lambda x: x.features))
predictionAndLabels = indexed_test_reg.map(lambda lp: lp.label).zip(
    predictions)

testMetrics = RegressionMetrics(predictionAndLabels)
print("RMSE = %s" % testMetrics.rootMeanSquaredError)
print("R-sqr = %s" % testMetrics.r2)

# Save model
datestamp = str(datetime.datetime.now()).replace(' ',
                                                 '').replace(':', '_')
Example #29
def trainTestSaveALLModel(rddDir, encodedFeaturesParq, featuresNumValsFile):
    predictors = []
    modelType = ""
    if "batting" in encodedFeaturesParq:
        modelType = 'batting'
        predictors = hitterPredictors
    else:
        modelType = 'pitching'
        predictors = pitcherPredictors
    not_features.extend(predictors)
    # Load and parse the data file.
    features = sqlContext.read.parquet(encodedFeaturesParq).cache()
    print(features.take(3))
    print("# features=", features.count())
    numVals = sqlContext.read.json(featuresNumValsFile).take(1)[0].asDict()
    (catFeatures, featureLookup) = getCatFeatures(features, numVals)
    all_fd_points_df = None
    fd_points_testData = None
    predictions = None
    for predictor in predictors:
        #global predictField
        #predictField = predictor
        #data = features.map(toLabeledPoint).coalesce(50)
        #data = toLabeledPoint(features, predictor).coalesce(50)
        #print "len data=", data.count()

        print "catFeatures=", catFeatures

        # Split the data into training and test sets (30% held out for testing)
        (f_trainingData, f_testData) = features.randomSplit([0.7, 0.3], seed=1)
        #trainingData = f_trainingData.map(toLabeledPoint).coalesce(50)
        trainingData = toLabeledPoint(f_trainingData, predictor).coalesce(50)
        #testData = f_testData.map(toLabeledPoint).coalesce(50)
        testData = toLabeledPoint(f_testData, predictor).coalesce(50)
        testData.cache()
        print "testData count=", testData.count()
        playerIds = f_testData.map(lambda x: str(x.player_id) + '_' + x.game_id).coalesce(50)
        print "playerIds=", playerIds
        print "playerIds=", playerIds.take(2)
        print "len playerIds=", playerIds.count()

        # Train a GradientBoostedTrees model.
        #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
        #         (b) Use more iterations in practice.
        model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo=catFeatures, maxDepth=5, numIterations=1, maxBins=300)

        # Evaluate model on test instances and compute test error
        predictions = model.predict(testData.map(lambda x: x.features)).cache()
        print "# predictions=", predictions.count()
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        if fd_points_testData is None:
            fd_points_testData = f_testData.map(lambda x: (str(x.player_id) + '_' + x.game_id, x.fd_points)).toDF(['player_id', 'actual_fd_points']).coalesce(50)

        testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
        testMAE = labelsAndPredictions.map(lambda vp: abs(vp[0] - vp[1])).sum() / float(testData.count())
        print(predictor + ' Test Mean Squared Error = ' + str(testMSE))
        print(predictor + ' Test Mean Absolute Error = ' + str(testMAE))

        if all_fd_points_df is None:
            #all_fd_points_df = testData.map(lambda x: x.player_id).zip(predictions).toDF(['player_id', predictor]).cache()
            print "FIRST: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('all_fd_points_df').cache()
            print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            print "# all_fd_points_df", all_fd_points_df.count()
            print "first all_fd_points_df", all_fd_points_df.take(5)
            print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
        else:
            print "ELSE: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            curr_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('curr_fd_points_df')
            print "all_fd_points_df", all_fd_points_df.printSchema()
            print "PRE all_fd_points_df", all_fd_points_df.take(5)
            print "curr_fd_points_df", curr_fd_points_df.printSchema()
            print "few curr_fd_points_df", curr_fd_points_df.take(5)
            print "# curr_fd_points_df", curr_fd_points_df.count()
            print "distinct curr_fd_points_df", curr_fd_points_df.select('player_id').distinct().count()
            print "first curr", curr_fd_points_df.take(5)
            #all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, all_fd_points_df.player_id == curr_fd_points_df.player_id, 'inner').drop(curr_fd_points_df.player_id)
            all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, col("all_fd_points_df.player_id") == col("curr_fd_points_df.player_id")).drop(curr_fd_points_df.player_id).alias('all_fd_points_df').cache()
            print "second ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            #print "all debugstring", all_fd_points_df.rdd.toDebugString()
            #print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
        print "first few all_fd_points_df=", all_fd_points_df.take(3)
        print "count few all_fd_points_df=", all_fd_points_df.count()
        print "converted:"
        print populateDebugString(model, featureLookup)

        # Save and load model
        modelFilename = rddDir + "pitching_" + predictor + "_model.RandomForest"
        if modelType == "batting":
            modelFilename = rddDir + "batting_" + predictor + "_model.RandomForest"
        try:
            shutil.rmtree(modelFilename)
        except OSError:
            pass
        model.save(sc, modelFilename)
        #sameModel = GradientBoostedTreesModel.load(sc, "myModelPath")
    print "DONE. all_fd_points_df", all_fd_points_df.printSchema()
    print "# of all_fd_points=", all_fd_points_df.count()
    print "first of all_fd_points=", all_fd_points_df.take(5)

    try:
        shutil.rmtree(rddDir + 'all_fd_points_df.csv')
    except OSError:
        pass
    all_fd_points_df.write.format('com.databricks.spark.csv').save(rddDir + 'all_fd_points_df.csv')
    allPredictions = None
    if len(predictors) > 1:
        allPredictions = all_fd_points_df.map(sumFD).toDF()
    else:
        allPredictions = all_fd_points_df.map(renameSumFD).toDF()
        print(allPredictions.rdd.toDebugString())
        print("predf allPredictions=", allPredictions.take(5))
        #allPredictions = allPredictions.toDF()
    try:
        shutil.rmtree(rddDir + 'allPredictions.csv')
    except OSError:
        pass
    allPredictions.write.format('com.databricks.spark.csv').save(rddDir + 'allPredictions.csv')
    print "allPredictions=", allPredictions.take(5)
    print "# of allPredictions=", allPredictions.count()
    predict_and_actuals = allPredictions.join(fd_points_testData, allPredictions.player_id == fd_points_testData.player_id).drop(fd_points_testData.player_id)
    print "predict_and_actuals=", predict_and_actuals.take(3)
    #labelsAndPredictions = all_fd_points_df.map(lambda x: x.fd_points).zip(allPredictions).cache()
    labelsAndPredictions = predict_and_actuals
    print "labelsAndPredictions=", labelsAndPredictions.take(3)
    def mse(x):
        r = x.asDict()
        if r['actual_fd_points'] is None:
            r['actual_fd_points'] = 0.0
        return (r['actual_fd_points'] - r['fd_sum']) * (r['actual_fd_points'] - r['fd_sum'])

    def mae(x):
        r = x.asDict()
        if r['actual_fd_points'] is None:
            r['actual_fd_points'] = 0.0
        return abs(r['actual_fd_points'] - r['fd_sum'])
    testMSE = labelsAndPredictions.map(mse).sum() / float(allPredictions.count())
    testMAE = labelsAndPredictions.map(mae).sum() / float(allPredictions.count())
    print('Merged ' + modelType + ' Test Mean Squared Error = ' + str(testMSE))
    print('Merged ' + modelType + ' Test Mean Absolute Error = ' + str(testMAE))
Example #30
from pyspark import SparkContext

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/newborn2013.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={
                                                    0: 3,
                                                    1: 4,
                                                    2: 2
                                                },
                                                numIterations=3)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingRegressionModel")

Example #31
def extract_label(record):
    return float(record[-1])


data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))

data_with_idx_dt = data_dt.zipWithIndex().map(lambda p: (p[1], p[0]))
test_dt = data_with_idx_dt.sample(False, 0.3, 42)
train_dt = data_with_idx_dt.subtractByKey(test_dt)
train_data_dt = train_dt.map(lambda p: p[1])
test_data_dt = test_dt.map(lambda p: p[1])

#we will train the Gradient Boosted tree model simply using the default arguments to the trainRegressor method
gbt_model = GradientBoostedTrees.trainRegressor(train_data_dt, categoricalFeaturesInfo={}, numIterations=10,
                                                learningRate=0.01, maxDepth=1, maxBins=2)
predictions_GBT = gbt_model.predict(test_data_dt.map(lambda x: x.features))
true_vs_predicted_dt = test_data_dt.map(lambda lp: lp.label).zip(predictions_GBT)
print("Gradient Boosted Tree prediction:" + str(true_vs_predicted_dt.take(5)))

Example #32
# Error Calculating Functions
# Mean Squared Error
def squared_error(actual, pred):
    return (pred - actual) ** 2


# Mean absolute Error
def abs_error(actual, pred):
    return np.abs(pred - actual)
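evaluate_gbt below calls a squared_log_error helper that this excerpt never defines; a minimal sketch consistent with the RMSLE it feeds, in the style of the two functions above:

# Assumed helper: squared log error for one (actual, pred) pair
def squared_log_error(actual, pred):
    return (np.log(pred + 1) - np.log(actual + 1)) ** 2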

def evaluate_gbt(train, test, numIterValue, maxDepth, maxBins):
    gbt_model = GradientBoostedTrees.trainRegressor(train, categoricalFeaturesInfo={}, numIterations=numIterValue, maxDepth=maxDepth, maxBins=maxBins)
    predictions_GBT = gbt_model.predict(test.map(lambda x: x.features))
    labelsAndPredictions_GBT = test.map(lambda lp: lp.label).zip(predictions_GBT)
    rmsleGBT = np.sqrt(labelsAndPredictions_GBT.map(lambda lp: squared_log_error(lp[0], lp[1])).mean())
    return rmsleGBT
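A hedged usage example (train_data_dt and test_data_dt are assumed to be the LabeledPoint RDDs split in the previous example; the parameter values are illustrative):

# Illustrative call, reusing the train/test split from the previous example
rmsle = evaluate_gbt(train_data_dt, test_data_dt, numIterValue=10, maxDepth=5, maxBins=32)
print('GBT RMSLE = ' + str(rmsle))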
Example #33
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_ssim.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_360.txt')

model = GradientBoostedTrees.trainRegressor(traindata,
                                            categoricalFeaturesInfo={},
                                            numIterations=5)

predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(
    data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/ssim_rbt")
predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
MSE_720 = labelsandpredictions_720.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/ssim_720_rbt")
predictions_540 = model.predict(data_540.map(lambda x: x.features))
Example #34


Train,Test=df.randomSplit([0.8,0.2])

train_data = []
test_data = []
for row in Train.rdd.collect():
     train_data.append( LabeledPoint(row[-1], list(row[:-1])))

y_test = []
X_test=[]
for row in Test.rdd.collect():
      y_test.append(row[-1]) 
      X_test.append(list(row[:-1]))


dir()
del data_f
del df_filter
del history_good_per
grm = GradientBoostedTrees.trainRegressor(sc.parallelize(train_data), {}, numIterations=1)
grm.save(sc, "file:///data/grm_model.model")

pred =  list(map(lambda x: grm.predict(x),X_test))

from pyspark.mllib.evaluation import RegressionMetrics
predictionAndObservations = sc.parallelize(zip(pred, y_test))
metrics = RegressionMetrics(predictionAndObservations)
metrics.meanAbsoluteError
metrics.meanSquaredError
Example #35
trainData, testData = train_test_split(option,test_size=0.2,random_state=42)
train = trainData.as_matrix()
test = testData.as_matrix()
def parsePoint(line):
    return LabeledPoint(line[7],line[0:7])
# create RDD
trainRDD = sc.parallelize(train)
testRDD = sc.parallelize(test)
trainLP = trainRDD.map(parsePoint)
testLP = testRDD.map(parsePoint)


# In[122]:

# build GB model
GBmodel = GradientBoostedTrees.trainRegressor(trainLP,
                                            categoricalFeaturesInfo={5:2}, numIterations=3)
predictions = GBmodel.predict(testLP.map(lambda x: x.features))
sparkGBError = testLP.map(lambda lp: lp.label).zip(predictions)
# compute MSE
testMSE = sparkGBError.map(lambda v: (v[0] - v[1])**2).sum() / float(testLP.count())


# In[124]:

testMSE


# In[111]:

# build SVM model
from pyspark.mllib.classification import SVMWithSGD, SVMModel
Example #36

testFinal.collect()


#For Getting the threshold limit, Using Train dataset

(training1, training2) = trainFinal.randomSplit([0.7, 0.3])

training1.collect()


model_1 = RandomForest.trainRegressor(training1, categoricalFeaturesInfo={},
                                    numTrees=3, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)
model_2 = GradientBoostedTrees.trainRegressor(training1,
                                            categoricalFeaturesInfo={}, numIterations=3)
model_3 = DecisionTree.trainRegressor(training1, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)


predictionsRFTrain = model_1.predict(training1.map(lambda x: x.features))
predictionsGBTTrain = model_2.predict(training1.map(lambda x: x.features))
predictionsDTTrain = model_3.predict(training1.map(lambda x: x.features))

predictionsRFTrain.collect()

predictionsGBTTrain.collect()

predictionsDTTrain.collect()

training1.collect()
Example #37
def trainTestSaveFDPointsModel(rddDir, encodedFeaturesParq, featuresNumValsFile):
    modelType = ""
    if "batting" in encodedFeaturesParq:
        modelType = 'batting'
    else:
        modelType = 'pitching'
    predictor = 'fd_points'
    not_features.append(predictor)
    # Load and parse the data file.
    features = sqlContext.read.parquet(encodedFeaturesParq).cache()
    print(features.take(3))
    print("# features=", features.count())
    numVals = sqlContext.read.json(featuresNumValsFile).take(1)[0].asDict()
    (catFeatures, featureLookup) = getCatFeatures(features, numVals)
    all_fd_points_df = None
    fd_points_testData = None
    predictions = None

    print "catFeatures=", catFeatures

    # Split the data into training and test sets (30% held out for testing)
    (f_trainingData, f_testData) = features.randomSplit([0.7, 0.3], seed=1)
    #trainingData = f_trainingData.map(toLabeledPoint).coalesce(50)
    trainingData = toLabeledPoint(f_trainingData, predictor).coalesce(50)
    #testData = f_testData.map(toLabeledPoint).coalesce(50)
    testData = toLabeledPoint(f_testData, predictor).coalesce(50)
    testData.cache()
    print "testData count=", testData.count()
    playerIds = f_testData.map(lambda x: str(x.player_id) + '_' + x.game_id).coalesce(50)
    print "playerIds=", playerIds
    print "playerIds=", playerIds.take(2)
    print "len playerIds=", playerIds.count()

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo=catFeatures, maxDepth=6, numIterations=32, maxBins=300)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features)).cache()
    print "# predictions=", predictions.count()
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    fd_points_testData = f_testData.map(lambda x: (str(x.player_id) + '_' + x.game_id, x.fd_points or 0.0)).toDF(['player_id', 'actual_fd_points']).coalesce(50)

    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
    testMAE = labelsAndPredictions.map(lambda vp: abs(vp[0] - vp[1])).sum() / float(testData.count())
    print(predictor + ' Test Mean Squared Error = ' + str(testMSE))
    print(predictor + ' Test Mean Absolute Error = ' + str(testMAE))

#    print " # playerIds=", playerIds.count()
#    all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('all_fd_points_df').cache()
#    print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
#    print "# all_fd_points_df", all_fd_points_df.count()
#    print "first all_fd_points_df", all_fd_points_df.take(5)
#    print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
    print "converted:"
    print populateDebugString(model, featureLookup)

    # Save and load model
    modelFilename = rddDir + "pitching_" + predictor + "_model.RandomForest"
    if modelType == "batting":
        modelFilename = rddDir + "batting_" + predictor + "_model.RandomForest"
    try:
        shutil.rmtree(modelFilename)
    except OSError:
        pass
    model.save(sc, modelFilename)

    fd_points_testData_filename = rddDir + modelType + '_' + 'fd_points_testData.csv'
    try:
        shutil.rmtree(fd_points_testData_filename)
    except OSError:
        pass
    fd_points_testData.write.format('com.databricks.spark.csv').option('header', 'true').save(fd_points_testData_filename)
Example #38
(trainingData, testData) = labeledPoints.randomSplit([0.7, 0.3])

# COMMAND ----------

labeledPoints.collect()

# COMMAND ----------

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# Train a GradientBoostedTrees model.
#  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
#         (b) Use more iterations in practice.
model = GradientBoostedTrees.trainRegressor(trainingData,
                                            categoricalFeaturesInfo={},
                                            numIterations=3)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression GBT model:')
print(model.toDebugString())

# COMMAND ----------

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils