Example #1
def cross_validation_gb(Data_1,Data_2,Data_3,loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train_1 = GradientBoostedTrees.trainRegressor(Data_1.union(Data_2), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    testMSE_1 = labelsAndPredictions_1.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_3.count())

    model_train_2 = GradientBoostedTrees.trainRegressor(Data_2.union(Data_3), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_1.count())

    model_train_3 = GradientBoostedTrees.trainRegressor(Data_3.union(Data_1), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_2.count())

    return (testMSE_1+testMSE_2+testMSE_3)/3
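A hedged generalization of the rotation above (not from the original source): the three hand-written folds can be expressed as a loop over an arbitrary list of LabeledPoint RDD folds, assuming the same pyspark.mllib API.

from pyspark.mllib.tree import GradientBoostedTrees

def cross_validation_gb_kfold(folds, loss_type, num_iter, maxDepth):
    # Rotate each fold into the test position; train on the union of the rest.
    errors = []
    for i, test_fold in enumerate(folds):
        train_rdd = None
        for j, fold in enumerate(folds):
            if j != i:
                train_rdd = fold if train_rdd is None else train_rdd.union(fold)
        model = GradientBoostedTrees.trainRegressor(
            train_rdd, categoricalFeaturesInfo={}, loss=loss_type,
            numIterations=num_iter, maxDepth=maxDepth)
        preds = model.predict(test_fold.map(lambda x: x.features))
        pairs = test_fold.map(lambda lp: lp.label).zip(preds)
        errors.append(pairs.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /
                      float(test_fold.count()))
    return sum(errors) / len(errors)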
Example #2
def run_GBDT(input_file,output_file,iterations):
    dataRDD=sc.textFile(input_file).map(lambda x: x.replace('\t',','))
    #Now let us create labeled point from data
    dataRDDParsed=dataRDD.map(parsePoint).cache()
    featSet=dataRDDParsed.flatMap(lambda x: x).map(maaro).reduceByKey(lambda a,b: a+b).takeOrdered(26, lambda kv: -kv[1])
    #reduceByKey(lambda x,y:x+y).takeOrdered(25,lambda (k,v):-v)
    #print featSet
    #OHEdict=createOneHotDict(dataRDDParsed,featSet)
    OHEdict={}
    for i,x in enumerate(featSet):
#         print i,x
        OHEdict[x[0]]=i
   
    #print oneHotEncoding(dataRDDParsed,OHEdict,numSampleOHEFeats,)
    #Now let us create a dictionary of points
#     weights=[.8,.1,.1]
#     seed=42
#     trainRDD,validateRDD,testRDD=dataRDD.randomSplit(weights,seed)
#     OHETrainData = trainRDD.map(lambda point: parseOHEPoint(point, OHEdict, 39))
    OHETrainData = dataRDD.map(lambda point: parseOHEPoint(point, OHEdict, 39))
#     print OHETrainData.take(1)
#     print OHETrainData.count()

    model = (GradientBoostedTrees.trainClassifier(OHETrainData, loss = 'logLoss', numIterations=2, 
             categoricalFeaturesInfo={}, learningRate = 0.1, maxDepth = 7, maxBins = 2))
    
    sc.parallelize([model.toDebugString()]).coalesce(1).saveAsTextFile(output_file)  
Example #3
    def train(self):
        neg_df = spark.read.format(
            'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'
        ).option('header', 'true').load('neg.csv')
        pos_df = spark.read.format(
            'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'
        ).option('header', 'true').load('pos.csv')
        test_pos_df = spark.read.format(
            'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'
        ).option('header', 'true').load('ptest.csv')
        test_neg_df = spark.read.format(
            'org.apache.spark.sql.execution.datasources.csv.CSVFileFormat'
        ).option('header', 'true').load('ntest.csv')
        training_df = neg_df.union(pos_df)
        test_df = test_neg_df.union(test_pos_df)

        labelpointRdd = training_df.rdd.map(featureExtraction).map(
            lambda x: LabeledPoint(x[0], x[1:])).cache()
        TestlabelpointRdd = test_df.rdd.map(featureExtraction).map(
            lambda x: LabeledPoint(x[0], x[1:])).cache()

        GBTmodel = GradientBoostedTrees.trainClassifier(
            labelpointRdd, categoricalFeaturesInfo={}, numIterations=75)
        predictions = GBTmodel.predict(
            TestlabelpointRdd.map(lambda x: x.features))
        labelsAndPredictions = TestlabelpointRdd.map(lambda lp: lp.label).zip(
            predictions)

        # save model
        GBTmodel.save(sc, '.')
        return score(labelsAndPredictions)
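The score() helper used above is not shown in the snippet; a plausible sketch, assuming it computes plain accuracy from the (label, prediction) pairs:

def score(labels_and_predictions):
    # Fraction of test points whose predicted class matches the label.
    total = labels_and_predictions.count()
    correct = labels_and_predictions.filter(lambda vp: vp[0] == vp[1]).count()
    return correct / float(total)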
Example #4
def train_model(cls, trainData, cateFeaInfo={}, iterTimes=3):
    """
    Train the model.
    """
    model = GradientBoostedTrees.trainClassifier(trainData, \
        categoricalFeaturesInfo=cateFeaInfo, numIterations=iterTimes)
    return model
Example #5
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train= set(['stars', 'business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_train = [v for i, v in enumerate(train_data.columns) if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train= (transformed_train.select("features", "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(sc.parallelize(data_train.collect(),5), categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id','review_id', 'user_id'])
    newlist_final = [v for i, v in enumerate(test_data.columns) if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,outputCol="features")

    transformed_final= assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
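Note that DataFrame.map was removed in Spark 2.x, so the select(...).map(...) calls above only run on Spark 1.x. A minimal sketch of a 2.x equivalent, assuming the same column names; MLUtils.convertVectorColumnsFromML bridges the ml-style vectors produced by VectorAssembler back to the mllib vectors that LabeledPoint expects:

from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

# Convert the assembled vector column, then drop to the RDD API via .rdd.
converted_train = MLUtils.convertVectorColumnsFromML(transformed_train, "features")
data_train = converted_train.select("features", "stars").rdd \
    .map(lambda row: LabeledPoint(row.stars, row.features))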
Example #6
def main(sc, sql_context, is_hive=True):
    lp_train = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.train")
    lp_check = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.check")

    model = GradientBoostedTrees.trainRegressor(lp_train, {},
                                                numIterations=50,
                                                maxDepth=10)

    preds = model.predict(lp_check.map(lambda x: x.features))
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)

    for each in labels_and_preds.take(100):
        print(each)

    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print(each)

    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print(mse)
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print(mse)
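The hand-rolled MSE above can also be delegated to pyspark.mllib.evaluation.RegressionMetrics, which expects an RDD of (prediction, observation) pairs; a sketch reusing the preds and lp_check names from this example:

from pyspark.mllib.evaluation import RegressionMetrics

preds_and_labels = preds.zip(lp_check.map(lambda x: x.label))
metrics = RegressionMetrics(preds_and_labels)
print(metrics.meanSquaredError)
print(metrics.rootMeanSquaredError)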
Example #7
def trainevaluatemodel_gbdt(traindata, validationdata, loss, numiterations, learningrate, maxdepth, maxbins):
    starttime = time()
    model = GradientBoostedTrees.trainClassifier(traindata, categoricalFeaturesInfo={}, loss=loss,
                                                 numIterations=numiterations, learningRate=learningrate,
                                                 maxDepth=maxdepth, maxBins=maxbins)
    index = evaluation(model, validationdata)
    duration = time() - starttime
    print('Param:' + '\n' +
          'loss:' + str(loss) + '\n' +
          'numiterations:' + str(numiterations) + '\n' +
          'learningrate:' + str(learningrate) + '\n' +
          'maxdepth:' + str(maxdepth) + '\n' +
          'maxbins:' + str(maxbins) + '\n' +
          'time:' + str(duration) + '\n' +
          'index:' + str(index))
    return (loss, numiterations, learningrate, maxdepth, maxbins, duration, index)
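A possible driver for this helper (a sketch; the traindata and validationdata RDDs are assumptions, as is the chosen grid):

results = []
for loss in ['logLoss']:
    for numiterations in [10, 20]:
        for learningrate in [0.1, 0.3]:
            results.append(trainevaluatemodel_gbdt(
                traindata, validationdata, loss,
                numiterations, learningrate, maxdepth=5, maxbins=32))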
Example #9
def ctr_gbdt(file_dir):
    sc = SparkContext(appName="CTRGBDTRegression")

    path = file_dir + CTR_TRAINING_DATA + "/part*"
    data = sc.textFile(path)
    (training_data, testData) = data.randomSplit([0.7, 0.3])
    parsed_train_data = training_data.map(_parse_point)
    parsed_test_data = testData.map(_parse_point)

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(parsed_train_data,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=100)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(parsed_test_data.map(lambda x: x.features))
    labels_and_predictions = parsed_test_data.map(lambda lp: lp.label).zip(
        predictions)
    test_err = labels_and_predictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(parsed_test_data.count())

    logger = logging.getLogger()
    logger.debug('GBDT Training Error = ' + str(test_err))
    logger.debug('Learned classification GBT model:')
    logger.debug(model.toDebugString())
    logger.debug("Tree totalNumNodes" + str(model.totalNumNodes()))

    # Save and load model
    ctr_gbdt_data = file_dir + CTR_GBDT_DATA
    model.save(sc, ctr_gbdt_data)

    logger.info("GBDT training finished")
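The saved model can be reloaded in a later job with GradientBoostedTreesModel.load, the counterpart of the model.save call above (sketch):

from pyspark.mllib.tree import GradientBoostedTreesModel

same_model = GradientBoostedTreesModel.load(sc, file_dir + CTR_GBDT_DATA)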
Example #10
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Example #11
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        except ValueError:
            self.fail()
Example #12
def Regression_Model(filename):
    open_price, close_price, open_price_train, close_price_train, True_price, True_price_train, Date = get_csv_data(
        filename)
    output = []
    for i in range(1, len(Date)):
        tmp = LabeledPoint(label=True_price_train[i],
                           features=[close_price_train[i]])
        output.append(tmp)

    output_train_RDD = sc.parallelize(output).cache()
    lrm = LinearRegressionWithSGD.train(output_train_RDD,
                                        step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance',
                                       maxDepth=5,
                                       maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=5,
                                         maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)

    print("\n============MODEL Evaluation=============\n")
    model_name = [
        'LinearRegression', 'DecisionTree', 'RandomForest',
        'GradientBoostedTrees'
    ]
    es_modelname = ['lrm', 'tree', 'forest', 'gradient']
    result = ''
    x = 0
    err = 1000
    test_model = 'LinearRegression'
    # Swap in a different model RDD here
    output_model_RDD = lrm
    for model in [lrm, tree, forest, gradient]:
        predictions = model.predict(output_train_RDD.map(lambda x: x.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(
            predictions)
        # Note: the **0.5 makes this the root mean squared error
        RMSE = (
            labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /
            float(output_train_RDD.count()))**0.5
        #print ("Predictions: ", valuesAndPreds.take(10))
        result += model_name[x] + "\tRoot Mean Squared Error\t=" + str(RMSE) + "\n"
        if (err > RMSE):
            err = RMSE
            output_model = model
            es_model = es_modelname[x]
        x += 1
    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
Example #13
def main():
    text = sc.textFile(inputs)

    nltk_data_path = "[change to your own nltk_data location]"  # maybe changed to the sfu server path
    nltk.data.path.append(nltk_data_path)
    cleaned_review = text.map(clean_reviewf).cache()

    reviews_txt = cleaned_review.map(lambda review: review['reviewText'])
    reviews = cleaned_review.map(lambda review: (review['overall'], review['reviewText'], review['reviewTime'])).cache()
    # each review tuple is (rating, review_text, review_date)
    training_reviews = reviews.filter(lambda r: r[2].tm_year < 2014)
    testing_reviews = reviews.filter(lambda r: r[2].tm_year == 2014)
    training_data = training_reviews.map(lambda r: (r[0], r[1])).zipWithIndex().cache()
    testing_data = testing_reviews.map(lambda r: (r[0], r[1])).zipWithIndex().cache()

    # each indexed tuple is ((rating, review_text), review_index)
    training_rating = training_data.map(lambda x: (x[1], x[0][0]))
    training_review_text = training_data.map(lambda x: (x[1], x[0][1]))
    training_review_text_flat = training_review_text.flatMapValues(myf)
    training_review_text_flat = training_review_text_flat.map(lambda x: (x[1], x[0]))

    testing_rating = testing_data.map(lambda x: (x[1], x[0][0]))
    testing_review_text = testing_data.map(lambda x: (x[1], x[0][1]))
    testing_review_text_flat = testing_review_text.flatMapValues(myf)
    testing_review_text_flat = testing_review_text_flat.map(lambda x: (x[1], x[0]))

    word2vec_model = generate_word2vec_model(reviews_txt)
    mv = word2vec_model.getVectors()
    # this step seems redundant but necessary
    mvdct = []
    for k,v in mv.items():
        vec = [f for f in v]
        mvdct.append((k,vec))
    dct_rdd = sc.parallelize(mvdct)

    training_feature_vecs = dct_rdd.join(training_review_text_flat)
    # join yields (word, (feature_vec, review_index)) pairs
    training_vecs = training_feature_vecs.map(lambda x: (x[1][1], (x[1][0], 1)))
    training_reduce_vecs = training_vecs.reduceByKey(lambda v1,v2: (np.sum([v1[0],v2[0]], axis=0),v1[1]+v2[1]))
    training_avg_vecs = training_reduce_vecs.map(lambda x: (x[0], np.array(x[1][0])/float(x[1][1])))
    training_rating_avgf = training_rating.join(training_avg_vecs)
    training_lps = training_rating_avgf.map(get_lp)

    testing_feature_vecs = dct_rdd.join(testing_review_text_flat)
    testing_vecs = testing_feature_vecs.map(lambda x: (x[1][1], (x[1][0], 1)))
    testing_reduce_vecs = testing_vecs.reduceByKey(lambda v1,v2: (np.sum([v1[0],v2[0]], axis=0),v1[1]+v2[1]))
    testing_avg_vecs = testing_reduce_vecs.map(lambda x: (x[0], np.array(x[1][0])/float(x[1][1])))
    testing_rating_avgf = testing_rating.join(testing_avg_vecs)
    testing_lps = testing_rating_avgf.map(get_lp)

    gbt_model = GradientBoostedTrees.trainClassifier(training_lps,
                                                 categoricalFeaturesInfo={}, numIterations=20)
    predictions = gbt_model.predict(testing_lps.map(lambda x: x.features))
    labelsAndPredictions = testing_lps.map(lambda lp: lp.label).zip(predictions)
    MSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testing_lps.count())
    RMSE = math.sqrt(MSE)
    result = str(RMSE)

    outdata = sc.parallelize([result])
    outdata.saveAsTextFile(output)
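The get_lp helper referenced above is not shown; a plausible sketch, assuming the joins yield (review_index, (rating, avg_feature_vec)) pairs:

def get_lp(row):
    # row is (review_index, (rating, avg_feature_vec))
    review_index, (rating, avg_vec) = row
    return LabeledPoint(rating, avg_vec)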
Example #14
def cross_validation_gb(Data_1, Data_2, Data_3, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train_1 = GradientBoostedTrees.trainRegressor(
        Data_1.union(Data_2),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    testMSE_1 = labelsAndPredictions_1.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_3.count())

    model_train_2 = GradientBoostedTrees.trainRegressor(
        Data_2.union(Data_3),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_1.count())

    model_train_3 = GradientBoostedTrees.trainRegressor(
        Data_3.union(Data_1),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(Data_2.count())

    return (testMSE_1 + testMSE_2 + testMSE_3) / 3
Example #15
def testRegression(trainingData, testData, model_path):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, numIterations=3, maxDepth=4)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
    print("Test Mean Squared Error = " + str(testMSE))
    print("Learned regression GBT model:")
    print(model.toDebugString())
    model.save(sc, model_path)
Example #16
def testClassification(trainingData, testData):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={}, numIterations=30, maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda v_p: v_p[0] != v_p[1]).count() / float(testData.count())
    print("Test Error = " + str(testErr))
    print("Learned classification ensemble model:")
    print(model.toDebugString())
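Beyond the raw error rate, ranking metrics can be computed with pyspark.mllib.evaluation.BinaryClassificationMetrics, which takes an RDD of (score, label) pairs; a sketch reusing predictions and testData from this example (the "scores" here are hard 0/1 predictions, so the resulting AUC is coarse):

from pyspark.mllib.evaluation import BinaryClassificationMetrics

score_and_labels = predictions.zip(testData.map(lambda lp: lp.label))
metrics = BinaryClassificationMetrics(score_and_labels)
print(metrics.areaUnderROC)
print(metrics.areaUnderPR)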
Example #17
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #18
def validation_gb(trainingData,testData, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                      loss=loss_type,
                                                      numIterations=num_iter, maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(testData.count())
    return testMSE
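A possible use of this helper (a sketch; the split RDDs and grid values are assumptions): trainRegressor accepts 'leastSquaresError' or 'leastAbsoluteError' as the loss, so the two can be compared on the same split.

for loss_type in ['leastSquaresError', 'leastAbsoluteError']:
    mse = validation_gb(trainingData, testData, loss_type,
                        num_iter=20, maxDepth=5)
    print(loss_type, mse)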
Example #19
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #20
def GBDT_train(data, filename):
    data_train = split_data(data, [0.5, 0.5])
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_GBDT = GradientBoostedTrees.trainClassifier(training, {},
                                                      numIterations=20)
    # Note: pyspark.mllib models cannot be called inside worker-side lambdas,
    # so predict on an RDD of features and zip the results with the labels.
    predictions = model_GBDT.predict(test.map(lambda x: x.features))
    predictionAndlabel = predictions.zip(test.map(lambda x: x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()
    print("accuracy of model_GBDT:%f" % accuracy)
    pre_all(data, model_GBDT, filename)
    return model_GBDT, accuracy
Example #21
def testRegression(trainingData, testData):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                numIterations=30, maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() \
        / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression ensemble model:')
    print(model.toDebugString())
Example #22
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #23
def main():
    #Reading train and test data
    trainData  = sc.pickleFile(input+'/Train_data.average/part-00000')
    testData = sc.pickleFile(input+'/Test_data.average/part-00000')
    # len() is undefined for the float label, so filter on the features only
    parsedData = trainData.map(parseInput).filter(lambda line: len(line.features) != 0)
    parsedTestData = testData.map(parseInput).filter(lambda line: len(line.features) != 0).cache()
    model = GradientBoostedTrees.trainRegressor(parsedData,categoricalFeaturesInfo={}, numIterations=1)
    predictions = model.predict(parsedTestData.map(lambda x: x.features))
    labelsAndPredictions = parsedTestData.map(lambda lp: lp.label).zip(predictions)
    validationErr = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(parsedTestData.count())
    parsedTestData.unpersist()
    RMSE=math.sqrt(validationErr)

    print("Root Mean Squared Error Test= " + str(RMSE))
Example #24
def testRegression(trainingData, testData):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=30,
                                                maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() \
        / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression ensemble model:')
    print(model.toDebugString())
Example #25
def validation_gb(trainingData, testData, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(
        trainingData,
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(testData.count())
    return testMSE
Example #26
def testClassification(trainingData, testData):
    # Train a GradientBoostedTrees model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=30,
                                                 maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda v_p: v_p[0] != v_p[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification ensemble model:')
    print(model.toDebugString())
Example #27
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train = set(
        ['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [
        v for i, v in enumerate(train_data.columns)
        if v not in removelist_train
    ]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train,
                                      outputCol="features")

    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select(
        "features",
        "stars")).map(lambda row: LabeledPoint(row.stars, row.features))

    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(
        sc.parallelize(data_train.collect(), 5),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)

    # Creating a list of features to be used for predictions
    removelist_final = set(
        ['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [
        v for i, v in enumerate(test_data.columns) if v not in removelist_final
    ]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,
                                      outputCol="features")

    transformed_final = assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(
        lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
Example #28
def Gradient_BoostedTrees(filename, sc):
	# Load and parse the data file.
	data = MLUtils.loadLibSVMFile(sc, "/Users/Jacob/SparkService/data/sample_libsvm_data.txt")
	# Split the data into training and test sets (30% held out for testing)
	(trainingData, testData) = data.randomSplit([0.7, 0.3])

	# Train a GradientBoostedTrees model.
	#  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
	#         (b) Use more iterations in practice.
	model = GradientBoostedTrees.trainClassifier(trainingData,
	                                             categoricalFeaturesInfo={}, numIterations=3)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification GBT model:')
	print(model.toDebugString())
Example #29
def fit(model, df, param):
    spark = model['spark']
    sc = spark.sparkContext
    feature_variables = param['feature_variables']
    target_variable = param['target_variables'][0]
    iterations = 10
    if 'options' in param:
        if 'params' in param['options']:
            if 'iterations' in param['options']['params']:
                iterations = int(param['options']['params']['iterations'])
    sdf = spark.createDataFrame(df)
    rdd = sdf.rdd.map(lambda row: LabeledPoint(row[
        target_variable], [row[x] for x in feature_variables]))

    model['model'] = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo={}, numIterations=iterations)

    info = {"message": "model trained"}
    return info
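A hypothetical call shape for fit (a sketch; the surrounding framework that supplies model, df, and param is not shown, and these names are assumptions):

param = {
    'feature_variables': ['f1', 'f2'],      # hypothetical column names
    'target_variables': ['label'],
    'options': {'params': {'iterations': '20'}},
}
info = fit(model, df, param)  # model['spark'] must hold the SparkSession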
Example #30
def crossValidator(IterNums,dataset_rdd,rate):
	dataset_positive = dataset_rdd.filter(lambda e:e[1]>0.5)
	dataset_negotive =  dataset_rdd.filter(lambda e:e[1]<0.5)
	# dataset_positive1,dataset_positive2,dataset_positive3,dataset_positive4,dataset_positive5 = dataset_positive.randomSplit([1,1,1,1,1])
	# dataset_negotive1,dataset_negotive2,dataset_negotive3,dataset_negotive4,dataset_negotive5 = dataset_negotive.randomSplit([1,1,1,1,1])
	dataset_positive_list = dataset_positive.randomSplit([1,1,1,1,1])
	dataset_negotive_list = dataset_negotive.randomSplit([1,1,1,1,1])
	result = []
        #result2 = []
	for i in range(5):
		testset_positive = dataset_positive_list[i].count()
		testset_rdd = dataset_positive_list[i].union(dataset_negotive_list[i])
		testset_count = testset_rdd.count()
		trainset_rdd = dataset_rdd.subtract(testset_rdd)
		trainset = trainset_rdd.map(lambda e:LabeledPoint(e[1],e[2:]))
		testset = testset_rdd.map(lambda e:LabeledPoint(e[1],e[2:]))
		model = GradientBoostedTrees.trainClassifier(trainset, {}, numIterations=IterNums,learningRate = rate)
        	#model2 = LogisticRegressionWithLBFGS.train(trainset,iterations = 100)
		predictions = model.predict(testset.map(lambda x:x.features))
                #predictions2 = model2.predict(testset.map(lambda x:x.features))
		predict = testset.map(lambda lp: lp.label).zip(predictions)
                #predict2 = testset.map(lambda lp:lp.label).zip(predictions2)
		hitALL =predict.filter(lambda e:e[0]==e[1]).count()
                #hitALL2 = predict2.filter(lambda e:e[0]==e[1]).count()
		hitPositive = predict.filter(lambda e:e[0]==e[1] and (e[0]>0.5)).count()
                #hitPositive2 = predict2.filter(lambda e:e[0]==e[1] and (e[0]>0.5)).count()
		positive = predict.filter(lambda e:e[1]>0.5).count()
                #positive2 = predict2.filter(lambda e:e[1]>0.5).count()
		recall = hitPositive/float(testset_positive)
                #recall2 = hitPositive2/float(testset_positive)
		precision = hitPositive/float(positive)
                #precision2 = hitPositive2/float(positive2)
		accuracy = hitALL/float(testset_count)
                #accuracy2 = hitALL2/float(testset_count)
		F_Value = 2/(1/precision+1/recall)
                #F_Value2 = 2/(1/precision2+1/recall2)
		result.append((precision,recall,accuracy,F_Value,hitPositive,positive,testset_positive,testset_count))
	return result
Example #31
def precreate_models(train_data):
    models = list()

    for depth in range(9, 10):
        for num_trees in range(4, 10, 3):
            for impurity in ['entropy']:  # ['gini', 'entropy']
                for feature in [
                        'onethird'
                ]:  # ['auto', 'all', 'sqrt', 'log2', 'onethird']
                    models.append(
                        RandomForest.trainClassifier(
                            train_data,
                            numClasses=10,
                            categoricalFeaturesInfo={},
                            numTrees=num_trees,
                            featureSubsetStrategy=feature,
                            impurity=impurity,
                            maxDepth=depth,
                            maxBins=32))

    for iters in range(9, 10):
        for rate in np.linspace(0.1, 1, 2):
            for depth in range(9, 10):
                for loss in [
                        'leastSquaresError'
                ]:  # ['logLoss', 'leastSquaresError', 'leastAbsoluteError']
                    models.append(
                        GradientBoostedTrees.trainClassifier(
                            train_data,
                            categoricalFeaturesInfo={},
                            loss=loss,
                            numIterations=iters,
                            learningRate=rate,
                            maxDepth=depth))

    return models
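A possible selection step to pair with precreate_models (a sketch; test_data is an assumed RDD of LabeledPoint): score every candidate on held-out data and keep the one with the lowest error.

def pick_best(models, test_data):
    best, best_err = None, float('inf')
    n = float(test_data.count())
    for m in models:
        preds = m.predict(test_data.map(lambda x: x.features))
        pairs = test_data.map(lambda lp: lp.label).zip(preds)
        err = pairs.filter(lambda vp: vp[0] != vp[1]).count() / n
        if err < best_err:
            best, best_err = m, err
    return best, best_err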
Example #32
def main():
    records = get_records()
    first = records.first()
    records.cache()

    # extract all the catgorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    first_point = data.first()

    gbt_model = GradientBoostedTrees.trainRegressor(data,
                                                    categoricalFeaturesInfo={},
                                                    numIterations=3)
    true_vs_predicted_gbt = data.map(lambda p:
                                     (p.label, gbt_model.predict(p.features)))

    predictions = gbt_model.predict(data.map(lambda x: x.features))
    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    print("GradientBoosted Trees predictions: " + str(
        labelsAndPredictions.take(5)))

    mse = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
        float(data.count())
    mae = labelsAndPredictions.map(lambda vp: np.abs(vp[0] - vp[1])).sum() /\
        float(data.count())
    rmsle = np.sqrt(labelsAndPredictions.map(
        lambda vp: (np.log(vp[1] + 1) - np.log(vp[0] + 1)) ** 2).sum() /
        float(data.count()))
    print('Gradient Boosted Trees - Mean Squared Error = ' + str(mse))
    print('Gradient Boosted Trees - Mean Absolute Error = ' + str(mae))
    print('Gradient Boosted Trees - Root Mean Squared Log Error = ' +
          str(rmsle))
Example #33
# We have to do something here to cache the dataset, otherwise it hangs later on due to a PySpark bug
num_records = training_data.count()

print("     * Transformed data read!")
print("     * Training test ML model... ")

# Label the data points
labeled_data = training_data.map(lambda x: LabeledPoint(x[-1], x[:-1]))
# Separate training and testing data
train_data, test_data = labeled_data.randomSplit([0.8, 0.2])
# Do something again to avoid the PySpark bug hang from manifesting
num_train_recs = train_data.count()
num_test_recs = test_data.count()
# Train the model
ml_model = GradientBoostedTrees.trainRegressor(train_data, {},
                                               numIterations=20,
                                               loss='leastAbsoluteError')

print("     * Model trained!")
print("     * Testing model error... ")

# Predict and calculate error metrics
predictions = ml_model.predict(test_data.map(lambda r: r.features))
predictions = predictions.zip(test_data.map(lambda r: r.label))
metrics = RegressionMetrics(predictions)

print("     * Model regression error metrics: ")
print("         - Mean Absolute Error: %.2f" % metrics.meanAbsoluteError)
print("         - Mean Squared Error: %.2f" % metrics.meanSquaredError)
print("         - Root Mean Squared Error: %.2f" %
      metrics.rootMeanSquaredError)
Example #34
trainingData = trainingData.cache()
testData = testData.cache()

from time import time
errors={}
cfi = {}
depth = 9
lr = 0.3
lossfunc = "logLoss"
#stopER = 0.27498
stopER = 0.2745
testER = 1.0
attemption = 0
while testER>=stopER:
    start=time()
    model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo=cfi, loss=lossfunc,
                                                 maxDepth=depth, numIterations=10, learningRate=lr)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in ['test','train']:  # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted=model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions=data.map(lambda lp:lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()/float(data.count())
        errors[depth][name]=Err
        if name=='test':
            if Err>=stopER:
                attemption+=1
                break
            else:
                testER = Err
print(depth, errors[depth])
Example #35
from pyspark.context import SparkContext
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1],values[1:10])

data = sc.textFile("heart_disease.csv")
data = data.map(parsePoint)


(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={},
                                                 numIterations=30, maxDepth=4)


# This works too!
train = sc.textFile("train.csv")
def parsePoint(line):
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1],values[:65])
train = train.map(parsePoint)
model = GradientBoostedTrees.trainClassifier(train, categoricalFeaturesInfo={},
                                                 numIterations=300, maxDepth=2,learningRate=0.1)
test = sc.textFile("test.csv")
test = test.map(parsePoint)
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())
Example #36

path='/covtype/covtype.data'
inputRDD=sc.textFile(path)

Label=2.0

Data = inputRDD.map(lambda line: [float(x) for x in line.split(',')]).map(lambda V:LabeledPoint((V[-1]==Label), V[:-1])).cache()

(trainingData,testData) = Data.randomSplit([0.7,0.3],seed=255)

from time import time
errors={}
catInfo = {}
for i in range(10,54):
    catInfo[i] = 2

for depth in [10]:
    start=time()
    model=GradientBoostedTrees.trainClassifier(trainingData,learningRate = 0.2, numIterations = 30, maxDepth = depth,
                                               categoricalFeaturesInfo=catInfo)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted=model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions=data.map(lambda x: x.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()/float(data.count())
        errors[depth][name]=Err
print(errors)
Example #37
Data=inputRDD.map(lambda line: [float(x) for x in line.split(',')]).map(lambda V:LabeledPoint(1.0, V[:-1]) if V[-1] == 2.0 else LabeledPoint(0.0, V[:-1])).cache()


# ### Reducing data size

# In[11]:

(trainingData,testData)=Data.randomSplit([0.7,0.3],seed=255)
trainingData.cache()
testData.cache()

# ### Gradient Boosted Trees

# In[13]:

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

errors={}
for depth in [14]:
    model=GradientBoostedTrees.trainClassifier(trainingData, {}, numIterations=15, maxDepth=depth)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted=model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions=data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()/float(data.count())
        errors[depth][name]=Err
    print(depth, errors[depth])
Example #38
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

# API docs
# https://spark.apache.org/docs/1.4.0/api/python/pyspark.mllib.html

# Function to convert .csv files into 'LabeledPoint' format
def parsePoint(line):
    values = [float(x.strip()) for x in line.split(',')]
    return LabeledPoint(values[-1],values[:65])

# Load .csv data
train_csv = sc.textFile("train.csv")
test_csv = sc.textFile("test.csv")

# Convert the data to LabeledPoint format
train_parsed = train_csv.map(parsePoint)
test_parsed = test_csv.map(parsePoint)

# Build a GBM / TreeNet model
model = GradientBoostedTrees.trainClassifier(
	train_parsed, loss='leastSquaresError',
	categoricalFeaturesInfo={}, numIterations=300,
	maxDepth=2, learningRate=0.1)

# Get predictions and see how it did
predictions = model.predict(test_parsed.map(lambda x: x.features))
labelsAndPredictions = test_parsed.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda t: t[0] != t[1]).count() / float(test_parsed.count())
print(testErr)
Example #39
# $example off$

if __name__ == "__main__":
    sc = SparkContext(
        appName="PythonGradientBoostedTreesClassificationExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingClassificationModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingClassificationModel")
# The data model is a dict mapping each column name to {"min": x, "max": y} for numeric fields, or to a list [val1, val2, ... valN] of values for string fields
datamodel = dmt.computeDataModel(df)
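# Illustrative only -- for a frame with one numeric and one string column, the
# computed datamodel might look like this (hypothetical column names):
#   {"age": {"min": 18.0, "max": 75.0}, "gender": ["M", "F"]}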

DataModelTools.checkTargetForModelType(datamodel,target,model_type)

# use DataModelTools to convert from DataFrame to an RDD of LabelledPoint for specified target/predictors
lp = dmt.extractLabelledPoint(df,target,predictors).map(lambda x:x[1]).cache()

# build the decision tree model
from pyspark.mllib.tree import GradientBoostedTrees

if model_type == "classification":
    model = GradientBoostedTrees.trainClassifier(
        lp,
        categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors),
        loss=loss_param,
        numIterations=numIterations_param,
        learningRate=learningRate_param,
        maxDepth=maxDepth_param,
        maxBins=maxBins_param)
else:
    # regression
    model = GradientBoostedTrees.trainRegressor(
        lp,
        categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors),
        loss=loss_param,
        numIterations=numIterations_param,
        learningRate=learningRate_param,
        maxDepth=maxDepth_param,
        maxBins=maxBins_param)

build_report = mbr.report(lp.count(),lp.getNumPartitions(),

# In[8]:

#Split the training set and test set
(trainingData, testData) = data.randomSplit([0.7, 0.3])


# In[9]:

#Training model
RF_model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto", 
                                        impurity='gini', maxDepth=5, maxBins=32)

GB_model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={}, numIterations=3)


# In[10]:

# Prediction
def cal_mllib_accuracy(models):  # 'models' rather than 'list', which shadows the builtin
    for i, clf in enumerate(models):
        # predict from the features
        predictions = clf.predict(testData.map(lambda x: x.features))
        # zip labels first, then predictions
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

        # float() avoids integer division under Python 2
        accuracy = labelsAndPredictions.filter(lambda vp: vp[0] == vp[1]).count() / float(testData.count())

        # compare results
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={}, numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(sc, "target/tmp/myGradientBoostingRegressionModel")
    # $example off$
#       * **maxDepth** – Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 3)
#       * **maxBins** – Maximum number of bins used for splitting features (default: 32). DecisionTree requires maxBins >= max categories.
#       
#       
# * `GradientBoostedTreesModel` represents the output of the boosting process: a linear combination of classification trees. The methods supported by this class are:
#    * `save(sc, path)` : save the tree to a given filename; sc is the SparkContext.
#    * `load(sc, path)` : the counterpart to save - load a classifier from file.
#    * `predict(X)` : predict on a single datapoint (the `.features` field of a `LabeledPoint`) or an RDD of datapoints.
#    * `toDebugString()` : print the classifier in a human-readable format.
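# A minimal sketch (not in the original notebook) of the persistence methods listed
# above; the path is illustrative and assumes `model`, `sc`, and `testData` exist:
#
#     model.save(sc, "target/tmp/gbtSketch")
#     same_model = GradientBoostedTreesModel.load(sc, "target/tmp/gbtSketch")
#     predictions = same_model.predict(testData.map(lambda x: x.features))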

errors = {}
catInfo = {}
for i in range(10, 54):
    catInfo[i] = 2  # features 10..53 are binary categorical
depth = 13
model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo=catInfo,
                                             maxDepth=depth, numIterations=13, learningRate=0.15)
# print(model.toDebugString())
errors[depth] = {}
dataSets = {'train': trainingData, 'test': testData}
for name in dataSets.keys():
    data = dataSets[name]
    Predicted = model.predict(data.map(lambda x: x.features))
    LabelsAndPredictions = data.map(lambda x: x.label).zip(Predicted)
    Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(data.count())
    errors[depth][name] = Err
print(depth, errors[depth])




# coding: utf-8
all_data = np.array(list(zip(yy, xx)))  # list(...) so this also works under Python 3
sss = ShuffleSplit(len(all_data) - 1, test_size=0.20, random_state=1234)

for train_indexes, test_indexes in sss:
    lparr = []
    test_lp_arr = []
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]

    for medianvalue, record in sample_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        lparr.append(lp)

    for medianvalue, record in test_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        test_lp_arr.append(lp)

    training_data = sc.parallelize(lparr).cache()
    test_data_rdd = sc.parallelize(test_lp_arr).cache()

    regression_model = GradientBoostedTrees.trainRegressor(training_data, categoricalFeaturesInfo={}, numIterations=10,maxDepth=10)
    result = regression_model.predict(test_data_rdd.map(lambda x: x.features))
    print(regression_model)
    print(regression_model.toDebugString())
    print("===============================")
    predicted_data = result.collect()
    actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()

    print(mean_absolute_error(actual_data, predicted_data))
    break
#model=SVMWithSGD.train(train, 1.0) 
### Change XCA
# TODO We are testing several MLs
# 1) Logistic Regression
#model = LogisticRegressionWithSGD.train(train)   # used for logistic-regression classification

# 2) SVM Classification
#model = SVMWithSGD.train(train)   # used for SVM classification

# 3) RandomForest
#************ The random forest model in pyspark is experimental, so it may not work perfectly
#model = RandomForest.trainClassifier(train, 2, {}, 300, seed=2)   # 300 trees is reported as best for this dataset in the literature

##### from doc

#model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
#                                     numTrees=3, featureSubsetStrategy="auto",
#                                     impurity='gini', maxDepth=4, maxBins=32)

# Gradient Boost

model = GradientBoostedTrees.trainClassifier(train, categoricalFeaturesInfo={},
                                                 numIterations=30, maxDepth=4)
                                                 
print "retrieving predictions and evaluating"
predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print "accuracy for GradientBoostedTrees:"+str(accuracy)


Example #46
0
# In[45]:

Data1 = Data.sample(False, 0.1, seed=255).cache()
(trainingData, testData) = Data1.randomSplit([0.7, 0.3], seed=255)

# print 'Sizes: Data1=%d, trainingData=%d, testData=%d'%(Data1.count(),trainingData.cache().count(),testData.cache().count())

# In[59]:

from time import time
errors = {}
for depth in [10]:
    model = GradientBoostedTrees.trainClassifier(Data1,
                                                 categoricalFeaturesInfo={},
                                                 numIterations=10,
                                                 maxDepth=depth,
                                                 learningRate=0.25,
                                                 maxBins=54)
    #print model.toDebugString()
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(
            lambda vp: vp[0] != vp[1]).count() / float(data.count())
        errors[depth][name] = Err
    print(depth, errors[depth])

# In[ ]:
def parsePoint2(line):
    values= [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

#train data load
train_data_new = sc.textFile('/home/hduser/dataset.txt')
parsedData = train_data_new.map(parsePoint)
#test data load
test_data_new = sc.textFile('/home/hduser/testfile.txt')
test_final = test_data_new.map(parsePoint2)

# Split train and test
X_train, X_test = parsedData.randomSplit([0.8,0.2])

#train the classifier
model=GradientBoostedTrees.trainClassifier(X_train,categoricalFeaturesInfo={},numIterations=10)
#20% of training data
predictions=model.predict(X_test.map(lambda x: x.features))
labelsAndPredictions1 = X_test.map(lambda p: p.label).zip(predictions)

#test data
predictions1=model.predict(test_final.map(lambda x: x.features))
y_final = test_final.map(lambda p: p.label).zip(predictions1)


# divide by the test-set count (X_test), not the training-set count
er = labelsAndPredictions1.filter(lambda vp: vp[0] != vp[1]).count() / float(X_test.count())
acc = (1 - er) * 100
print('===============================================================')
print(model.toDebugString())
print('===============================================================')
for i in y_final.collect():
Example #48
0
print("Number of test set rows: %d" % test_data.count())

# COMMAND ----------

# MAGIC %md ### Train Gradient Boosted trees and Random Forest model

# COMMAND ----------

from pyspark.mllib.tree import RandomForest
from time import *
from pyspark.mllib.tree import GradientBoostedTrees

start_time = time()

# Train a model Gradient Boosted Trees
modelGBT = GradientBoostedTrees.trainClassifier(training_data,
                                                categoricalFeaturesInfo={})

end_time = time()
elapsed_time_GBT = end_time - start_time
print("Time to train GBT model: %.3f seconds" % elapsed_time_GBT)

# Train a model Random Forest
start_time = time()
model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, \
   numTrees=3, featureSubsetStrategy="auto", impurity="gini", \
   maxDepth=4, maxBins=32, seed=SEED)

end_time = time()
elapsed_time_RF = end_time - start_time
print("Time to train Random Forest model: %.3f seconds" % elapsed_time_RF)
Example #49
0
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.tree import RandomForest, RandomForestModel

from pyspark.mllib.util import MLUtils

# Read the file into an RDD
# If doing this on a real cluster, you need the file to be available on all nodes, ideally in HDFS.
path='/HIGGS/HIGGS.csv'
inputRDD=sc.textFile(path)

# Transform the text RDD into an RDD of LabeledPoints
Data = inputRDD.map(lambda line: [float(x.strip()) for x in line.split(',')]) \
               .map(lambda x: LabeledPoint(x[0], x[1:]))

Data1=Data.sample(False,0.1, seed=255).cache()
(trainingData,testData)=Data1.randomSplit([0.7,0.3],seed = 255)
trainingData.cache()
testData.cache()

errors={}
depth = 10
model=GradientBoostedTrees.trainClassifier(trainingData, {}, numIterations=30, learningRate=0.3,  maxDepth=depth)
errors[depth]={}
dataSets={'train':trainingData,'test':testData}
for name in dataSets.keys():  # Calculate errors on train and test sets
    data=dataSets[name]
    Predicted=model.predict(data.map(lambda x: x.features))
    LabelsAndPredictions=data.map(lambda x: x.label).zip(Predicted)
    Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(data.count())
    errors[depth][name] = Err
print(depth, errors[depth])
Example #50
0
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
trainingData = trainingData.map(lambda x: LabeledPoint(x[0], x[1]))

# <=================================================================================================================>
# RandomForest Classifier
maxDepth_selection = [5, 10, 15, 20, 30]
maxBins_selection = [10, 20, 30, 40]
model_rf = RandomForest.trainClassifier(trainingData, numClasses = 8, \
                                     numTrees = 800, featureSubsetStrategy = "auto", \
                                     impurity = 'gini', maxDepth = 5, maxBins = 30)

predictions_rf = model_rf.predict(testData.map(lambda x: x[1]))
labelsAndPredictions_rf = testData.map(lambda x: x[0]).zip(predictions_rf)
testErr_rf = labelsAndPredictions_rf.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print("Test error is " + str(testErr_rf))

# <=================================================================================================================>
# Gradient Boost Decision Tree
# tuning order
learningRate_selection = [0.1, 0.2, 0.3]
maxDepth_selection = [5, 10, 15, 20, 30]

# Note: MLlib GBTs support binary classification only, and trainClassifier has no
# numClasses parameter, so this call cannot separate 8 classes like the RandomForest above
model_xgbt = GradientBoostedTrees.trainClassifier(trainingData, \
                                              loss='logLoss', numIterations=800, \
                                              learningRate=0.1, maxDepth=10)
predictions_xgbt = model_xgbt.predict(testData.map(lambda x: x[1]))
labelsAndPredictions_xgbt = testData.map(lambda x: x[0]).zip(predictions_xgbt)
testErr_xgbt = labelsAndPredictions_xgbt.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print("Test error is " + str(testErr_xgbt))
Example #52
0
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_ssim.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_360.txt')

model = GradientBoostedTrees.trainRegressor(traindata,
                                            categoricalFeaturesInfo={},
                                            numIterations=5)

predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(
    data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/ssim_rbt")
predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
MSE_720 = labelsandpredictions_720.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/ssim_720_rbt")
predictions_540 = model.predict(data_540.map(lambda x: x.features))
Example #53
0
        for x in featurs_raw:
            feature = float(x.strip().strip("'").strip())
            features.append(feature)

        label = float(fields[11])
        #print ("label=" + str(label))
        return LabeledPoint(label,features)

    data = sc.textFile("/Users/jiayangan/project/SearchAds/data/log/ctr_features_demo3/part*")
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    parsedTrainData = trainingData.map(parsePoint)
    parsedTestData = testData.map(parsePoint)

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(parsedTrainData,
                                                 categoricalFeaturesInfo={}, numIterations=100,maxDepth=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(parsedTestData.map(lambda x: x.features))
    labelsAndPredictions = parsedTestData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedTestData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())
    print("tree totalNumNodes" + str(model.totalNumNodes()))

    # Save and load model
    model.save(sc, "/Users/jiayangan/project/SearchAds/data/model/ctr_gbdt_model_demo_20")
Example #54
0
##### Trees
#####
##### Now let’s try three variants of tree-based classification. 
##### The API is slightly different from previous algos.
from pyspark.mllib.tree import DecisionTree

from pyspark.mllib.tree import GradientBoostedTrees

from  pyspark.mllib.tree import RandomForest

model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={})
score(model)


model = GradientBoostedTrees.trainClassifier(training_data, categoricalFeaturesInfo={}, numIterations=10)
score(model)

model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=16)
score(model)

#### Naive Bayes
#### Last but not least, let’s try the Naive Bayes classifier.
from pyspark.mllib.classification import NaiveBayes
model = NaiveBayes.train(training_data)
score(model)

Example #55
0
#       * **maxBins** – Maximum number of bins used for splitting features (default: 32). DecisionTree requires maxBins >= max categories.
#
#
# * `GradientBoostedTreesModel` represents the output of the boosting process: a linear combination of classification trees. The methods supported by this class are:
#    * `save(sc, path)` : save the tree to a given filename; sc is the SparkContext.
#    * `load(sc, path)` : the counterpart to save - load a classifier from file.
#    * `predict(X)` : predict on a single datapoint (the `.features` field of a `LabeledPoint`) or an RDD of datapoints.
#    * `toDebugString()` : print the classifier in a human-readable format.

# In[32]:

from time import time
errors = {}
for depth in [10]:
    start = time()
    model = GradientBoostedTrees.trainClassifier(trainingData, {}, maxDepth=depth, numIterations=30)
    # print(model.toDebugString())
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(data.count())
        errors[depth][name] = Err
    print(depth, errors[depth])  # , int(time() - start), 'seconds'
# print(errors)


# In[33]:
Err = 0.0
results = []
for train_index, test_index in ss:
    X_training, Y_training, X_test, Y_test = [], [], [], []
    for i in train_index:
        X_training.append(X[i])
        Y_training.append(Y[i])
    for i in test_index:
        X_test.append(X[i])
        Y_test.append(Y[i])

    parsedData = []
    for i in range(0, len(X_training)):
        parsedData.append(LabeledPoint(Y_training[i], X_training[i]))

    model = GradientBoostedTrees.trainClassifier(sc.parallelize(parsedData), {}, numIterations=10)

    testErr = 0
    for i in range(0, len(X_test)):
        a = Y_test[i]
        b = model.predict(X_test[i])
        if a != b:
            testErr += 1

    Err += float(testErr) / float(len(X_test))

print("AVG test error: %.6f" %
      (Err / iter_number))
Example #57
0
# In[6]:

Data1=Data.sample(False,0.1, seed=255).cache()
(trainingData,testData)=Data1.randomSplit([0.7,0.3],seed=255)


# ### Gradient Boosted Trees

# In[7]:

from time import time
errors = {}
for depth in [10]:
    model = GradientBoostedTrees.trainClassifier(Data1,
                                                 categoricalFeaturesInfo={}, numIterations=10,
                                                 maxDepth=depth, learningRate=0.25, maxBins=35)
    # print(model.toDebugString())
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(data.count())
        errors[depth][name] = Err
    print(depth, errors[depth])


# In[ ]:
Example #58
0
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(),
                         dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(),
                         rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(),
                         gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")
import sys, pickle,math
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)

input_dir = sys.argv[1]  # 'input_dir' rather than 'input', which would shadow the builtin

# Load and parse the data
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])

train = sc.pickleFile(input_dir + '/bow_train/part-00000')
test = sc.pickleFile(input_dir + '/bow_test/part-00000')
# the label is a float, so len(line.label) would raise TypeError; filter on the features only
parsedtrain = train.map(parsePoint).filter(lambda lp: len(lp.features) != 0)
parsedtest = test.map(parsePoint).filter(lambda lp: len(lp.features) != 0).cache()
model = GradientBoostedTrees.trainRegressor(parsedtrain,categoricalFeaturesInfo={}, numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
val_err = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE=math.sqrt(val_err)

print("Root Mean Squared Error Test= " + str(RMSE))