def main():
    records = get_records()
    records.cache()

    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2,10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])

    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, cat_len, mappings)))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))


    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)

    data_dt_log = data_dt.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    dt_model_log = DecisionTree.trainRegressor(data_dt_log, {})

    preds_log = dt_model_log.predict(data_dt_log.map(lambda p: p.features))
    actual_log = data_dt_log.map(lambda p: p.label)
    true_vs_predicted_dt_log = actual_log.zip(preds_log).map(lambda (t, p): (np.exp(t), np.exp(p)))

    calculate_print_metrics("Decision Tree Log", true_vs_predicted_dt_log)
def decisionTreeRegression(trainingData, testData, trainingSize, testSize):
    '''
    Decision tree for regression.
    '''
    # parameter range
    maxDepthValList = [10, 20, 30]
    maxBinsValList = [16, 24, 32]

    # best parameters
    bestMaxDepthVal = 5
    bestMaxBinsVal = 16
    bestTrainingRMSE = 1e10

    for maxDepthVal, maxBinsVal in itertools.product(maxDepthValList,
                                                     maxBinsValList):
        model = DecisionTree.trainRegressor(trainingData,
                                            categoricalFeaturesInfo={},
                                            impurity='variance',
                                            maxDepth=maxDepthVal,
                                            maxBins=maxBinsVal)
        predictions = model.predict(trainingData.map(lambda x: x.features))
        ValsAndPreds = trainingData.map(lambda x: x.label).zip(predictions)
        trainingRMSE = math.sqrt(
            ValsAndPreds.map(lambda (v, p):
                             (v - p)**2).reduce(lambda x, y: x + y) /
            trainingSize)
        if trainingRMSE and trainingRMSE < bestTrainingRMSE:
            bestMaxDepthVal = maxDepthVal
            bestMaxBinsVal = maxBinsVal
            bestTrainingRMSE = trainingRMSE
        print maxDepthVal, maxBinsVal, trainingRMSE
    print bestMaxDepthVal, bestMaxBinsVal, bestTrainingRMSE

    model = DecisionTree.trainRegressor(trainingData,
                                        categoricalFeaturesInfo={},
                                        impurity='variance',
                                        maxDepth=bestMaxDepthVal,
                                        maxBins=bestMaxBinsVal)

    # evaluating the model on training data
    predictions = model.predict(trainingData.map(lambda x: x.features))
    ValsAndPreds = trainingData.map(lambda x: x.label).zip(predictions)
    trainingRMSE = math.sqrt(
        ValsAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)
        / trainingSize)
    print trainingRMSE

    # evaluating the model on test data
    predictions = model.predict(testData.map(lambda x: x.features))
    ValsAndPreds = testData.map(lambda x: x.label).zip(predictions)
    testRMSE = math.sqrt(
        ValsAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)
        / testSize)
    print testRMSE
def main():
    records = get_records()
    records.cache()

    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    print "Feature vector length for categorical features: %d" % cat_len
    print "Feature vector length for numerical features: %d" % num_len
    print "Total feature vector length: %d" % total_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))

    data_dt = records.map(
        lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    first_point_dt = data_dt.first()
    print "Decision Tree feature vector: " + str(first_point_dt.features)
    print "Decision Tree feature vector length: " + str(
        len(first_point_dt.features))

    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
    print "Decision Tree depth: " + str(dt_model.depth())
    print "Decision Tree number of nodes: " + str(dt_model.numNodes())

    calculate_print_metrics("Decision Tree", true_vs_predicted_dt)
Example 4
def trainEvaluateModel(trainData, validationData, impurityParm, maxDepthParm,
                       maxBinsParm):
    '''
    Training exercises several parameter values; for DecisionTree, the values of impurity, maxDepth, maxBins, etc. all affect both accuracy and training time.
    We chart these parameter values against accuracy and training time.
    We evaluate one parameter at a time, e.g. maxDepth over the values [3, 5, 10, 15, 20, 25], as follows:
    (1) call DecisionTree.trainRegressor with trainData and each value of the single parameter under test;
    (2) once the model is built, evaluate its RMSE on validationData;
    (3) repeat training and evaluation, producing an RMSE and a run time for every parameter setting, stored in metricsRDD;
    (4) when all runs have finished, convert metricsRDD to a Pandas DataFrame;
    (5) from the Pandas DataFrame, plot RMSE and run time to show how each parameter value relates to accuracy and execution time.
    :param trainData:
    :param validationData:
    :param impurityParm:
    :param maxDepthParm:
    :param maxBinsParm:
    :return:
    '''
    print('======================= train and evaluate model =======================')
    startTime = time()
    model = DecisionTree.trainRegressor(trainData,
                                        categoricalFeaturesInfo={},
                                        impurity=impurityParm,
                                        maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm)
    RMSE = evaluateModel(model, validationData)
    duration = time() - startTime
    print('========== [trainEvaluateModel] >>>> trained and evaluated model with params: impurity=' +
          str(impurityParm) + ', maxDepth=' + str(maxDepthParm) +
          ', maxBins=' + str(maxBinsParm) + '\n' + '\t\t==>> time taken=' +
          str(duration) + ', resulting RMSE=' + str(RMSE))
    return (RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model)
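
A minimal sketch of steps (3)-(5) from the docstring above, assuming trainData and validationData are already-built LabeledPoint RDDs and trainEvaluateModel is the function just defined; the grid values, column names, and plot call are illustrative, not from the original.

import pandas as pd

# step (3): evaluate several maxDepth values with the other parameters held fixed
metricsList = [trainEvaluateModel(trainData, validationData, 'variance', maxDepth, 32)
               for maxDepth in [3, 5, 10, 15, 20, 25]]
# step (4): convert the collected metrics into a Pandas DataFrame
metricsDf = pd.DataFrame(metricsList,
                         columns=['RMSE', 'duration', 'impurity', 'maxDepth', 'maxBins', 'model'])
# step (5): chart RMSE against the parameter values (requires matplotlib)
metricsDf.plot(x='maxDepth', y='RMSE')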
Example 5
def evaluate_dt(train, test, maxDepth, maxBins):
    model = DecisionTree.trainRegressor(train, {},
                                        impurity='variance',
                                        maxDepth=maxDepth,
                                        maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    tp = actual.zip(preds)
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
    return rmsle
def main():

    records = get_records()
    records.cache()

    print "Mapping of first categorical feature column: %s" % get_mapping(
        records, 2)

    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len

    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))

    data_dt = records.map(
        lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    cat_features = dict([(i - 2, len(get_mapping(records, i)) + 1)
                         for i in range(2, 10)])
    print "Categorical feature size mapping %s" % cat_features
    # train the model again
    dt_model = DecisionTree.trainRegressor(
        data_dt, categoricalFeaturesInfo=cat_features)
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)

    calculate_print_metrics("Decision Tree Categorical Features",
                            true_vs_predicted_dt)
Example 7
def evaluate_final(description, data, maxDepth, maxBins):
    data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k))
    test = data_with_idx.sample(False, 0.13, 63)  # a 0.13 fraction yielded roughly a 10% test split here; 63 is the seed
    train = data_with_idx.subtractByKey(test)
    train_data = train.map(lambda (idx, p): p) #train_size = train_data.count()
    test_data = test.map(lambda (idx, p) : p) #test_size = test_data.count()

    dt_model = DecisionTree.trainRegressor(train_data,{}, maxDepth=maxDepth, maxBins=maxBins)
    preds = dt_model.predict(test_data.map(lambda p: p.features))
    actual = test_data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)

    print '\r\n-------- ' + description + ' ---------'
    print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
    print "Decision Tree depth: " + str(dt_model.depth())
    print "Decision Tree number of nodes: " + str(dt_model.numNodes())
    
    mse_dt = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
    mae_dt = true_vs_predicted_dt.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle_dt = np.sqrt(true_vs_predicted_dt.map(lambda (t, p): squared_log_error(t, p)).mean())
    
    print 'Decision Tree -Mean Squared Error: {0:2.4f}'.format(mse_dt)   
    print 'Decision Tree -Root Mean Squared Error: {0:2.4f}'.format(np.sqrt(mse_dt))  
    print 'Decision Tree -Mean Absolute Error: {0:2.4f}'.format(mae_dt)
    print 'Decision Tree -Root Mean Squared Log Error: {0:2.4f}'.format(rmsle_dt)
Example 8
def trainEvaluateModel(trainData,validationData,impurityParm, maxDepthParm, maxBinsParm):
    model = DecisionTree.trainRegressor(trainData,
                categoricalFeaturesInfo={},
                impurity=impurityParm,
                maxDepth=maxDepthParm,
                maxBins=maxBinsParm)
    return model
Example 9
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Example 10
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        except ValueError:
            self.fail()
Example 11
def Regression_Model(filename):
    open_price, close_price, open_price_train, close_price_train, True_price, True_price_train, Date = get_csv_data(
        filename)
    output = []
    for i in range(1, len(Date)):
        tmp = LabeledPoint(label=True_price_train[i],
                           features=[close_price_train[i]])
        output.append(tmp)

    output_train_RDD = sc.parallelize(output).cache()
    lrm = LinearRegressionWithSGD.train(output_train_RDD,
                                        step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance',
                                       maxDepth=5,
                                       maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=5,
                                         maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)

    print("\n============MODEL Evaluation=============\n")
    model_name = [
        'LinearRegression', 'DecisionTree', 'RandomForest',
        'GradientBoostedTrees'
    ]
    es_modelname = ['lrm', 'tree', 'forest', 'gradient']
    result = ''
    x = 0
    err = 1000
    test_model = 'LinearRegression'
    # swap in a different model here
    output_model_RDD = lrm
    for model in [lrm, tree, forest, gradient]:
        predictions = model.predict(output_train_RDD.map(lambda x: x.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(
            predictions)
        RMSE = (
            labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /
            float(output_train_RDD.count()))**0.5
        #print ("Predictions: ", valuesAndPreds.take(10))
        result += model_name[x] + "\tRoot Mean Squared Error\t=" + str(RMSE) + "\n"
        if (err > RMSE):
            err = RMSE
            output_model = model
            es_model = es_modelname[x]
        x += 1
    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
Example 12
def evaluate_dt(train, test, maxDepth, maxBins):
    model = DecisionTree.trainRegressor(train, {},
                                        impurity='variance',
                                        maxDepth=maxDepth,
                                        maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    tp = actual.zip(preds)
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
    return rmsle
Example 13
def trainEvaluateModel(trainData, validationData, impurityParam, maxDepthParam,
                       maxBinsParam):
    startTime = time()
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={}, \
        impurity=impurityParam, maxDepth=maxDepthParam, maxBins=maxBinsParam)
    RMSE = evaluateModel(model, validationData)
    duration = time() - startTime
    print("训练评估:impurity->", impurityParam, ", maxDepth->", maxDepthParam,
          ", maxBins->", maxBinsParam)
    print("==> 所需时间:", duration, "s , RMSE=", RMSE)
    return (RMSE, duration, impurityParam, maxDepthParam, maxBinsParam, model)
Example 14
def evaluate_dt(train, test, maxDepth, maxBins):
    dtModel = DecisionTree.trainRegressor(train, {},
                                          impurity='variance',
                                          maxDepth=maxDepth,
                                          maxBins=maxBins)
    preds = dtModel.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    actual_vs_pred = actual.zip(preds)
    #print actual_vs_pred.take(10)
    #print "decision tree depth: %d" % dtModel.depth()
    #print "decision tree number of nodes: %d" % dtModel.numNodes()
    return actual_pred_error(actual_vs_pred)
Example 15
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example 16
def trainEvaluationModel(trainData, validationData, impurityParm, maxDepthParm, maxBinsParm):
    startTime = time()
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity=impurityParm, maxDepth=maxDepthParm, maxBins=maxBinsParm)
    RMSE = evaluateModel(model, validationData)
    duration = time() - startTime
    print("训练评估:使用参数 " + \
         " impurity = " + str(impurityParm) + \
         " maxDepth = " + str(maxDepthParm) + \
         " maxBins = " + str(maxBinsParm) + \
         " ==> 所需时间 = " + str(duration) + " 秒"\
         " 结果 RMSE = %f" %RMSE)
    return RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model
def trainEvaluateModel(trainData, validationData, impurityParam, maxDepthParam,
                       maxBinsParam):
    starttime = time()
    model = DecisionTree.trainRegressor(data=trainData,
                                        categoricalFeaturesInfo={},
                                        impurity=impurityParam,
                                        maxDepth=maxDepthParam,
                                        maxBins=maxBinsParam)
    RMSE = evaluateModel(model, validationData)  # root mean squared error
    duration = time() - starttime
    print("Trained and evaluated with params:\n", "impurity=", impurityParam, "\n maxDepth=",
          maxDepthParam, "\n maxBins=", maxBinsParam, "====> time taken=", duration,
          "\n resulting RMSE=", RMSE)
    return (RMSE, duration, impurityParam, maxDepthParam, maxBinsParam, model)
Example 18
def regression(sc, sample):

    traindata = sc.parallelize(sample)
    traindata = traindata.map(lambda x: LabeledPoint(x[1], x[0]))
    testdata = [8.2]
    #####
    #    linear_model = LinearRegressionWithSGD.train(traindata,iterations=10)
    #    prediction = linear_model.predict(testdata)
    #    print prediction

    #####
    decision_model = DecisionTree.trainRegressor(traindata, {})
    prediction = decision_model.predict(testdata)
    print prediction
Example 19
def TrainEvaluateModel(trainData,validationData,
                       impurityParm,maxDepthParm,maxBinsParm):
    startTime = time()
    model = DecisionTree.trainRegressor(trainData,
                         categoricalFeaturesInfo={}, impurity=impurityParm, maxDepth=maxDepthParm,
                         maxBins=maxBinsParm)
    RMSE = EvaluateModel(model, validationData)
    duration = time() - startTime
    print("Evaluate the model: use the params: " + \
         "impurity=" + str(impurityParm) + \
         " maxDepthParm=" + str(maxDepthParm) + \
         " maxBinsParm=" + str(maxBinsParm) + "\n" + \
         "====> duration time = " + str(duration) + \
         " result RMSE = " + str(RMSE))
    return (RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model)
Example 20
def regression(sc, sample):

    traindata = sc.parallelize(sample)
    traindata = traindata.map(lambda x:LabeledPoint(x[1],x[0]))
    testdata = [8.2]
    #####
#    linear_model = LinearRegressionWithSGD.train(traindata,iterations=10)
#    prediction = linear_model.predict(testdata)
#    print prediction


    #####
    decision_model = DecisionTree.trainRegressor(traindata,{})
    prediction = decision_model.predict(testdata)
    print prediction
def train_evaluate_model(train_data, valid_data, impurity, max_depth,
                         max_bins):
    start_time = time()
    # train
    model = DecisionTree.trainRegressor(train_data,
                                        categoricalFeaturesInfo={},
                                        impurity=impurity,
                                        maxDepth=max_depth,
                                        maxBins=max_bins)
    # evaluate (y_pred vs y_true)
    RMSE = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"Trained and evaluated with params impurity={impurity}, maxDepth={max_depth}, "
          f"maxBins={max_bins} ==> time taken={duration}, resulting RMSE={RMSE}")
    return RMSE, duration, impurity, max_depth, max_bins, model
    def test_all(self, measure_columns=None, dimension_columns=None):
        measures = measure_columns[0]
        self._target_column = measures
        #dimension = dimension_columns[0]
        all_dimensions = self._dimension_columns
        all_measures = list(x for x in self._measure_columns if x != measures)
        cat_feature_info = []
        #columns_without_dimension = list(x for x in all_dimensions if x != dimension)
        columns_without_dimension = all_dimensions
        mapping_dict = {}
        masterMappingDict = {}
        decision_tree_result = DecisionTreeResult()
        for column in all_dimensions:
            mapping_dict[column] = dict(enumerate(self._data_frame.select(column).distinct().rdd.map(lambda x: str(x[0])).collect()))
        # for c in mapping_dict:
        #     name = c
        #     reverseMap = {v: k for k, v in mapping_dict[c].iteritems()}
        #     udf = UserDefinedFunction(lambda x: reverseMap[x], StringType())
        #     self._data_frame = self._data_frame.select(*[udf(column).alias(name) if column == name else column for column in self._data_frame.columns])

        # converting spark dataframe to pandas for transformation and then back to spark dataframe
        pandasDataFrame = self._data_frame.toPandas()
        for key in mapping_dict:
            pandasDataFrame[key] = pandasDataFrame[key].apply(lambda x: 'None' if x is None else x)
            reverseMap = {v: k for k, v in mapping_dict[key].items()}
            pandasDataFrame[key] = pandasDataFrame[key].apply(lambda x: reverseMap[x])
        # sqlCtx = SQLContext(self._spark)
        self._data_frame = self._spark.createDataFrame(pandasDataFrame)
        self._mapping_dict = mapping_dict
        for c in columns_without_dimension:
            cat_feature_info.append(self._data_frame.select(c).distinct().count())
        if len(cat_feature_info) > 0:
            max_length = max(cat_feature_info)
        else:
            max_length = 32
        cat_feature_info = dict(enumerate(cat_feature_info))
        #dimension_classes = self._data_frame.select(dimension).distinct().count()
        self._data_frame = self._data_frame[[measures] + columns_without_dimension + all_measures]
        data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
        (trainingData, testData) = data.randomSplit([1.0, 0.0])
        # TO DO : set maxBins at least equal to the max level of categories in dimension column
        model = DecisionTree.trainRegressor(trainingData,  categoricalFeaturesInfo=cat_feature_info, impurity='variance', maxDepth=3, maxBins=max_length)
        output_result = model.toDebugString()
        decision_tree = self.tree_json(output_result, self._data_frame)
        self.generate_probabilities(decision_tree, measures)
        decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
        return decision_tree_result
Example 23
def trainEvaluateModel(trainData, validationData,
                       impurityParm, maxDepthParm, maxBinsParm):
    startTime = time()
    model = DecisionTree.trainRegressor(trainData,
                                        categoricalFeaturesInfo={},
                                        impurity=impurityParm,
                                        maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm)
    RMSE = evaluateModel(model, validationData)
    duration = time() - startTime
    print "訓練評估:使用參數" + \
        " impurityParm= %s" % impurityParm + \
        "  maxDepthParm= %s" % maxDepthParm + \
        "  maxBinsParm = %d." % maxBinsParm + \
        "  所需時間=%d" % duration + \
        "  結果RMSE = %f " % RMSE
    return (RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model)
Example 24
def dealData(path):
    rawData = sc.textFile(path + 'hour.csv')
    header = rawData.first()
    rData = rawData.filter(lambda x: x != header)

    lines = rData.map(lambda x: x.split(","))
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(process_label(r), process_features(r)))
    print(labelpointRDD.first())
    # split into training, validation, and test sets
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([7, 1, 2])
    print("training set size: " + str(trainData.count()) + " validation set size: " +
          str(validationData.count()) + " test set size: " + str(testData.count()))

    # cache the data in memory to speed up later computations
    trainData.persist()
    validationData.persist()
    testData.persist()

    model = DecisionTree.trainRegressor(trainData,
                                        categoricalFeaturesInfo={},
                                        impurity="variance",
                                        maxDepth=5,
                                        maxBins=32,
                                        minInstancesPerNode=1,
                                        minInfoGain=0.0)

    rmse = RMSE(model, validationData)
    print("均方误差RMSE=" + str(rmse))

    ## 评估参数 maxDepth
    maxDepthList = [3, 5, 10, 15, 20, 25]
    maxBinsList = [10]
    minInstancesPerNodeList = [1]
    minInfoGainList = [0.0]

    ## 返回结果存放至metries中
    metrics = [
        trainEvaluateModel(trainData, validationData, maxDepth, maxBins,
                           minInstancesPerNode, minInfoGain)
        for maxDepth in maxDepthList for maxBins in maxBinsList
        for minInstancesPerNode in minInstancesPerNodeList
        for minInfoGain in minInfoGainList
    ]
Example 25
    def test_regression(self):
        from pyspark.mllib.regression import (
            LinearRegressionWithSGD,
            LassoWithSGD,
            RidgeRegressionWithSGD,
        )
        from pyspark.mllib.tree import DecisionTree

        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0})),
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Example 26
def trainEvaluateModel(trainData, validationData, maxDepthParm, maxBinsParm,
                       minInstancesPerNodeParm, minInfoGainParm):
    startTime = time.time()
    # build and train the model
    model = DecisionTree.trainRegressor(
        trainData,
        categoricalFeaturesInfo={},
        impurity="variance",
        maxDepth=maxDepthParm,
        maxBins=maxBinsParm,
        minInstancesPerNode=minInstancesPerNodeParm,
        minInfoGain=minInfoGainParm)
    # compute RMSE
    rmse = RMSE(model, validationData)
    duration = time.time() - startTime  # elapsed time
    print("Trained and evaluated: maxDepth=" + str(maxDepthParm) + ",  maxBins=" +
          str(maxBinsParm) + ", minInstancesPerNode=" +
          str(minInstancesPerNodeParm) + ", minInfoGain=" +
          str(minInfoGainParm) + "\n"
          "===> time taken=" + str(duration) + ",  RMSE=" + str(rmse))
    return rmse, duration, maxDepthParm, maxBinsParm, minInstancesPerNodeParm, minInfoGainParm, model
Example 27
def train_model():
    data = get_dataset()
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    metrics_combos = []

    bins = [x for x in range(50, 500, 10)]
    depths = [x for x in range(4, 12)]
    for numBin in bins:
        for depth in depths:
            model = DecisionTree.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                impurity='variance',
                                                maxDepth=depth,
                                                maxBins=numBin)

            predictions = model.predict(testData.map(lambda x: x.features))
            labelsAndPredictions = testData.map(lambda lp: lp.label).zip(
                predictions)
            metrics = RegressionMetrics(labelsAndPredictions)
            metrics_combos.append(((numBin, depth), metrics.meanSquaredError))

    print(sorted(metrics_combos, key=lambda s: s[1]))
Example 28
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = \
            DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Example 29
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
    def getPredictionsLabels(model, test):
        predictions = model.predict(test.map(lambda r: r.features))
        return predictions.zip(test.map(lambda r: r.label))

    def printMetrics(predictions_and_labels):
        metrics = RegressionMetrics(predictions_and_labels)
        f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
        f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
        f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
        f.write('Root Mean Squared Error:{0}\n'.format(
            metrics.rootMeanSquaredError))
        f.write('R^2 :{0}\n'.format(metrics.r2))

    for j in range(numModels):
        regp = paramGrid[j]['regParam']
        iters = paramGrid[j]['iterations']
        regt = paramGrid[j]['regType']
        con = paramGrid[j]['convergenceTol']

        #f.write('Model{0}: regParam = {1}, iterations = {2}, regType = {3}, convergenceTol = {4}\n'.format(str(j), regp, iters, regt, con))
        # Train decision tree regression model with hyperparameter set
        model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo = {}, impurity='variance', \
            maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0)

        predictions_and_labels = getPredictionsLabels(model, test)
        printMetrics(predictions_and_labels)

    f.close()
    sc.stop()
	
	#ArrDelay is our response
	#ArrDelay becomes the 8th column now, and total columns in the data = 12
	label = clean_line_split[0]
	nonLabel = clean_line_split[1:]
	return LabeledPoint(label, nonLabel)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()
#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor (training, categoricalFeaturesInfo={},
                                         impurity='variance', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict (test.map (lambda x: x.features))
labelsAndPredictions = test.map (lambda lp: lp.label).zip (predictions)
testMSE = labelsAndPredictions.map (lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print 'Time consumed = ', (datetime.now() - startTime)

print ('Test Mean Squared Error = ' + str (testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

#save and load model
model.save (sc, "DTR-Narrow-2008")
Example 32
# get 90% train and 10% test data
data_with_idx = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test = data_with_idx.sample(False, 0.1)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p): p)
train_size = train_data.count()
test_size = test_data.count()
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)

# make decision tree model
dt_model = DecisionTree.trainRegressor(train_data, {})

# make predictions and measure error
preds = dt_model.predict(test_data.map(lambda p: p.features))
actual = test_data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())


def squared_error(actual, pred):
    return (pred - actual)**2


def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc,
                                  '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc,
                                  '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc,
                                  '/usr/hadoop/para_avg_halfsqrsum_360.txt')

model = DecisionTree.trainRegressor(traindata,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=32)

predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(
    data.count())
print("training MSE = " + str(MSE))
#labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_dt")
predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
MSE_720 = labelsandpredictions_720.map(lambda (v, p): (v - p) *
                                       (v - p)).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
first_point_dt = data_dt.first()

first_point_dt.label
first_point_dt.features
len(first_point_dt.features)

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree

linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
true_vs_predicted.take(5)

dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data_dt.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
true_vs_predicted_dt.take(5)
dt_model.depth()
dt_model.numNodes()

def squared_error(actual, pred):
    return (pred - actual) ** 2
def abs_error(actual, pred):
    return np.abs(pred - actual)
def squared_log_error(actual, pred):
    return (np.log(pred + 1) - np.log(actual + 1)) ** 2

true_vs_predicted.map(lambda t: squared_error(t[0], t[1])).mean()
Example 35
def learn(examples,depth,bin):
    global model
    model = DecisionTree.trainRegressor(examples, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=depth, maxBins=bin)
Example 36
def evaluate_dt(train, test, maxDepth, maxBins):
    dt_model = DecisionTree.trainRegressor(train, categoricalFeaturesInfo={0: 4},
                                           impurity='variance', maxDepth=maxDepth, maxBins=maxBins)
    dt_predictions = dt_model.predict(test.map(lambda x: x.features))
    dt_labelsAndPredictions = test.map(lambda lp: lp.label).zip(dt_predictions)
    dt_testMSE = dt_labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(test.count())
    return dt_testMSE
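
A hedged usage sketch for the evaluate_dt above, assuming train and test are existing RDDs of LabeledPoint whose feature 0 is categorical with 4 categories; the grid values are illustrative, not from the original.

# sweep a small illustrative grid and report the (maxDepth, maxBins) pair with the lowest test MSE
params = [(depth, bins) for depth in [5, 10, 20] for bins in [16, 32, 64]]
results = [((depth, bins), evaluate_dt(train, test, depth, bins)) for depth, bins in params]
print sorted(results, key=lambda r: r[1])[0]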
Example 37
# get 90% train and 10% test data
data_with_idx = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test = data_with_idx.sample(False, 0.1)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p) : p)
train_size = train_data.count()
test_size = test_data.count()
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)

# make decision tree model 
dt_model = DecisionTree.trainRegressor(train_data,{})

# make predictions and measure error
preds = dt_model.predict(test_data.map(lambda p: p.features))
actual = test_data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

def squared_error(actual, pred): 
	return (pred - actual)**2

def squared_log_error(pred, actual):
	return (np.log(pred + 1) - np.log(actual + 1))**2
Example 38
# MAGIC %md DecisionTree performs best when it is told which features are categorical.  We construct a map categoricalFeaturesInfo to pass this information to DecisionTree.
# MAGIC If DecisionTree is not given this info, then it will treat all features as continuous.

# COMMAND ----------

# Construct a map for categorical features:
#   categoricalFeaturesInfo[column index] = number of categories
categoricalFeaturesInfo = {}
for j in xrange(numFeatures):
  col = featureCols[j]
  if col in categoryIndexes:
    categoricalFeaturesInfo[j] = len(categoryIndexes[col])

# COMMAND ----------

initialModel = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo)
initialModel

# COMMAND ----------

# We can print the full model, but it can be hard to parse when the tree is large.
print initialModel.toDebugString()

# COMMAND ----------

# MAGIC %md We now compute the error of the DecisionTreeModel on the training dataset.  We use Root Mean Squared Error (RMSE) as our error metric.
# MAGIC 
# MAGIC Denote (y_i, x_i) as the (label, feature vector) for instance i, and write model.predict(x_i) as our model's predicted label for instance i.  RMSE is defined as:
# MAGIC 
# MAGIC %[ RMSE(dataset) = \left[ \mathbf{avg}_{(y_i, x_i) \in dataset} \left( y_i - model.predict(x_i) \right)^2 \right]^{1/2} ]%
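
# COMMAND ----------

# MAGIC %md A minimal sketch of the RMSE defined above, assuming `predsAndLabels` is an RDD of (prediction, label) pairs; the helper name is illustrative, not part of the original notebook.

# COMMAND ----------

import math

def rmse(predsAndLabels):
  # average the squared residuals, then take the square root
  return math.sqrt(predsAndLabels.map(lambda (p, y): (y - p) ** 2).mean())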
Example 39
def run_decision_tree(userid):
	conf = SparkConf().setMaster("local[1]").setAppName("heart-disease-prediction-descision-tree")
	sc   = SparkContext(conf=conf)

	print "Running Spark Version %s" % (sc.version)


	# https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
	path = "/home/raju/Documents/hdp_proj"
	heartdf_tr = pd.read_csv(path+"processed.cleveland.data.csv",header=None)
	heartdf_test = pd.read_csv(path+"testdata.csv",header=None)
	print "Original training Dataset (Rows:Colums): "
	print heartdf_tr.shape
	print heartdf_test.shaperead_csvread_csvread_csv

	print "Categories of Diagnosis of heart disease (angiographic disease status) that we are predicting"
	print "-- Value 0: < 50% diameter narrowing"
	print "-- Value 1: > 50% diameter narrowing "
	print heartdf_tr.ix[:,13].unique() #Column containing the Diagnosis of heart disease
	print heartdf_test.ix[:,13].unique() #Column containing the Diagnosis of heart disease

	newheartdf = pd.concat([heartdf_tr.ix[:,13], heartdf_tr.ix[:,0:12]],axis=1, join_axes=[heartdf_tr.index])
	newheartdf_test = pd.concat([heartdf_test.ix[:,13], heartdf_test.ix[:,0:12]],axis=1, join_axes=[heartdf_test.index])
	newheartdf.replace('?', np.nan, inplace=True) # Replace ? values
	newheartdf_test.replace('?', np.nan, inplace=True) # Replace ? values

	print "After dropping rows with anyone empty value (Rows:Columns): "
	ndf2 = newheartdf.dropna()
	ndf_test = newheartdf_test.dropna()

	ndf2.to_csv(path+"new-heart-disease-cleaveland.txt",sep=",",index=False,header=None,na_rep=np.nan)
	ndf_test.to_csv(path+"new-heart-disease-cleaveland-test.txt",sep=",",index=False,header=None,na_rep=np.nan)

	print ndf2.shape
	print ndf_test.shape
	print ndf2.ix[:5,:]
	print ndf_test.ix[:5,:]

	print "Create a Labeled point which is a local vector, associated with a label/response"

	points = sc.textFile(path+'new-heart-disease-cleaveland.txt')
	points_test = sc.textFile(path+'new-heart-disease-cleaveland-test.txt')

	print "###############################Something"
	parsed_data = points.map(parsePoint)
	parsed_data_test = points_test.map(parsePoint)

	print 'First 5 training records: %s' % parsed_data.take(5)
	print 'First 5 test records: %s' % parsed_data_test.take(5)


	#####Perform Classification using a Decision Tree#####
	# Keep all of parsed_data for training; the test set comes from the separate test file
	(trainingData, trainingData1) = parsed_data.randomSplit([1,0])
	(testData , testData1) = parsed_data_test.randomSplit([1,0])
	# Train a DecisionTree model.
	#  Empty categoricalFeaturesInfo indicates all features are continuous. 
	print "+++++++++++++++++++++++++++++++++ Perform Classification using a Decision Tree +++++++++++++++++++++++++++++++++"
	model = DecisionTree.trainClassifier(trainingData, numClasses=5, categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=32)

	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
	print('Test Error = ' + str(testErr))
	print('=================== Learned classification tree model ====================')
	print(model.toDebugString())


	print "+++++++++++++++++++++++++++++++++ Perform Regression using a Decision Tree +++++++++++++++++++++++++++++++++"
	model1 = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={}, impurity='variance', maxDepth=4, maxBins=32)

	####### Evaluate model on test instances and compute test error########
	predictions = model1.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
	print('Test Mean Squared Error = ' + str(testMSE))
	print('================== Learned regression tree model ====================')
	print(model1.toDebugString())
	print(userid)
	input_data = get_input_data(userid[-20:-2])
	#features = vector.dense(result)
	prediction_value = model1.predict(input_data)
	print(prediction_value)
	post_prediction(userid[-20:-2],prediction_value)
Example 40
from pyspark.mllib.tree import DecisionTree

def loadRecord(line):
    input = StringIO.StringIO(line)
    reader = csv.reader(input)
    row = map(float, reader.next())
    return LabeledPoint(row[-1],row[:-1]) 

chf = open('data/CAhousing.csv','r')
header = chf.next().rstrip("\n").split(",")
for i,j in enumerate(header):
    print "%d: %s" % (i,j)

chrdd = sc.parallelize(chf).map(lambda line: loadRecord(line))
chrdd.persist()

(trainingData, testData) = chrdd.randomSplit([0.7, 0.3])


model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity='variance', minInstancesPerNode=2500)

predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())

with open("trunk.txt", "w") as f:
    f.write('Test Mean Squared Error = ' + str(testMSE))
    f.write('Learned regression tree model:')
    f.write( model.toDebugString() )
Example 41

data = records.map(
    lambda r: LabeledPoint(extract_label(r), extract_features(r)))
first_point = data.first()
first = records.first()
print("Raw data :")
print(first[2:])
print("Label")
print(first_point.label)
print("decision tree model feature vector :")
print(first_point.features)
print("decision tree model feature vector length :" +
      str(len(first_point.features)))

#
dt_model = DecisionTree.trainRegressor(data, {})
preds = dt_model.predict(data.map(lambda d: d.features))
actual = data.map(lambda d: d.label)
true_vs_predicted = actual.zip(preds)
print("decision tree prediction :" + str(true_vs_predicted.take(5)))
print("decision tree depth :" + str(dt_model.depth()))
print("decision tree number of nodes :" + str(dt_model.numNodes()))


#
def squared_error(actual, prediction):
    return (actual - prediction)**2


def abs_error(actual, prediction):
    return np.abs(actual - prediction)
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import KFold

# Kfold
if __name__ == "__main__":
	sc = SparkContext('local',appName="Prediction")
	import fileinput
	data_y1, data_y2 = [], []
	for line in fileinput.input("data/feature_extracted_class3.txt"):
		data_y1.append(LabeledPoint(float(1 if int(line.split("\t")[2])!=0 else 0), [float(i) for i in line.split("\t")[3:]]))
		data_y2.append(LabeledPoint(int(line.split("\t")[2]), [float(i) for i in line.split("\t")[3:]]))
	total, right, mse = 0, 0, []
	for t in xrange(10):
		kf = KFold(32*40, n_folds=10)
		for train, test in kf:
			data_train_y1, data_train_y2 = [], []
			for i in train:
				data_train_y1.append(data_y1[i])
				data_train_y2.append(data_y2[i])
			clf1 = DecisionTree.trainClassifier(sc.parallelize(data_train_y1), numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=100)
			clf2 = DecisionTree.trainRegressor(sc.parallelize(data_train_y2), categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=100)
			for i in test:
				data_test_y1, data_test_y2 = data_y1[i], data_y2[i]
				r1 = clf1.predict(data_test_y1.features)
				r2 = clf2.predict(data_test_y2.features)
				if r1 == data_test_y1.label:
					right += 1
				mse.append(abs(r2-data_test_y2.label))
				total += 1
	print float(right)/total, sum(mse)/len(mse)
Example 43
print "Decision Tree feature vector length: " + str(
    len(first_point_tree.features))

# In[167]:

from pyspark.mllib.tree import DecisionTree

#from the RDD sample 20% for training and rest for test
records_tree_with_idx = data_tree.zipWithIndex().map(lambda (k, v): (v, k))
test_tree_idx = records_tree_with_idx.sample(False, 0.2, 42)
training_tree_idx = records_tree_with_idx.subtractByKey(test_tree_idx)

test_tree = test_tree_idx.map(lambda (idx, p): p)
training_tree = training_tree_idx.map(lambda (idx, p): p)

model_tree = DecisionTree.trainRegressor(training_tree, {})

preds_tree = model_tree.predict(test_tree.map(lambda p: p.features))
actual_tree = test_tree.map(lambda p: p.label)
true_vs_predicted_tree = actual_tree.zip(preds_tree)

print "Decision Tree predictions: " + str(true_vs_predicted_tree.take(5))
print "Decision Tree depth: " + str(model_tree.depth())
print "Decision Tree number of nodes: " + str(model_tree.numNodes())

# In[177]:

mse_tree = true_vs_predicted_tree.map(lambda
                                      (t, p): squared_error(t, p)).mean()
mae_tree = true_vs_predicted_tree.map(lambda (t, p): abs_error(t, p)).mean()
help(DecisionTree.trainRegressor)


# ## Train a Regression Model on the Bike Sharing Dataset

# In[9]:

linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
print "Linear Model predictions: " + str(true_vs_predicted.take(5))


# In[10]:

# we pass in an empty mapping for categorical feature size {}
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())


# ## Performance Metrics

# In[11]:

# set up performance metrics functions 

def squared_error(actual, pred):
    return (pred - actual)**2
def extract_label(record):
	return float(record[17])

data_dt = RDD.map(lambda r: LabeledPoint(extract_label(r),extract_features_dt(r)))
first_point_dt = data_dt.first()
print "Decision Tree feature vector: " + str(first_point_dt.features)
print "Decision Tree feature vector length: " + str(len(first_point_dt.features))
	

training_dt, test_dt = data_dt.randomSplit([0.9, 0.1])
print "trainging_dt count = ", training_dt.count()
print "test_dt count = ", test_dt.count()

print "###########Start decision tree using Spark MLLib ################"
from pyspark.mllib.tree import DecisionTree
dt_model = DecisionTree.trainRegressor(training_dt,{})
preds = dt_model.predict(test_dt.map(lambda p: p.features))
actual = test_dt.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.collect())
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())
# data_dt.saveAsTextFile("file:///home/cloudera/MMZ/FinalProject/temp/temp_training_data_dt")

def squared_error(actual, pred): # squared error, averaged for Mean Squared Error (MSE)
	return (pred - actual)**2
def abs_error(actual, pred): # absolute error, averaged for Mean Absolute Error (MAE)
	return np.abs(pred - actual)
def squared_log_error(pred, actual): # squared log error, for Root Mean Squared Log Error (RMSLE)
	return (np.log(pred + 1) - np.log(actual + 1))**2
mse_dt = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
summary = Statistics.colStats(testvecData)
variance = summary.variance()[0]
# compute the pseudo R-square
test_Rsqr1 = 1 - testMSE1/float(variance)





# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# use variance as impurity for regression
# maxDepth is the maximum number of levels in each tree
model2 = DecisionTree.trainRegressor(trainparsedData
									, categoricalFeaturesInfo={}
									, impurity='variance'
									, maxDepth=8
									, maxBins=32)


# evaluate the training error
# first make the prediction and create a new "vector" of all the predictions
trainpredictions = model2.predict(trainparsedData.map(lambda x: x.features))
# then you column bind the prediction and actual values into a new RDD
trainlabelsAndPredictions = trainparsedData.map(lambda lp: lp.label).zip(trainpredictions)
# use map operation to compute MSE
trainMSE2 = trainlabelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(trainparsedData.count())

# use the Statistics library to obtain the variance
summary = Statistics.colStats(trainvecData)
variance = summary.variance()[0]
Example 47

# In[22]:

for i,x in enumerate(features): print i,x


# In[23]:


# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity="variance", maxDepth=6, maxBins=12)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())


# In[24]:

# 
plt.xlabel("response")
plt.ylabel("prediction")
Example 48
print 'Decision tree feature vector length: ' + str(len(first_point_dt.features))

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree

help(LinearRegressionWithSGD.train)

linear_model = LinearRegressionWithSGD.train(data,
                                             iterations=10,
                                             step=0.1,
                                             intercept=False)
true_vs_predicted = data.map(
    lambda point: (point.label, linear_model.predict(point.features)))
print 'Linear regression model predictions for the first 5 samples: ' + str(true_vs_predicted.take(5))

dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print 'Decision tree regression model predictions for the first 5 samples: ' + str(true_vs_predicted_dt.take(5))
print 'Decision tree model depth: ' + str(dt_model.depth())
print 'Decision tree model number of nodes: ' + str(dt_model.numNodes())


def squared_error(actual, pred):
    return (pred - actual)**2


def abs_error(actual, pred):
    return np.abs(pred - actual)