Example #1
def test_multi_target_random_forest():
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor

    X_train, X_test, Y_train, _ = train_test_split(*shap.datasets.linnerud(),
                                                   test_size=0.2,
                                                   random_state=0)
    est = RandomForestRegressor(random_state=202,
                                n_estimators=10,
                                max_depth=10)
    est.fit(X_train, Y_train)
    predicted = est.predict(X_test)

    explainer = shap.TreeExplainer(est)
    expected_values = np.asarray(explainer.expected_value)
    assert len(
        expected_values
    ) == est.n_outputs_, "Length of expected_values doesn't match n_outputs_"
    shap_values = np.asarray(explainer.shap_values(X_test)).reshape(
        est.n_outputs_ * X_test.shape[0], X_test.shape[1])
    phi = np.hstack((shap_values, np.repeat(expected_values,
                                            X_test.shape[0]).reshape(-1, 1)))
    assert np.allclose(phi.sum(1), predicted.flatten(order="F"), atol=1e-4)
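
For contrast, a minimal single-output version of the same additivity check — a sketch, using shap's bundled diabetes dataset for illustration:

import shap
import numpy as np
from sklearn.ensemble import RandomForestRegressor

X, y = shap.datasets.diabetes()
est = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)
explainer = shap.TreeExplainer(est)
shap_values = explainer.shap_values(X)
# With a single output, expected_value is a scalar and shap_values is (n_samples, n_features):
assert np.allclose(shap_values.sum(1) + explainer.expected_value,
                   est.predict(X), atol=1e-4)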
Example #2
def spark_ml():
    # Assumes `train` and `test` Spark DataFrames are already defined in the
    # enclosing scope (see the sketch after this example).
    # Count Product_IDs that appear in test but not in train.
    diff_cat_in_train_test = test.select('Product_ID').subtract(train.select('Product_ID'))
    print(diff_cat_in_train_test.distinct().count())

    from pyspark.ml.feature import StringIndexer
    plan_indexer = StringIndexer(inputCol='Product_ID', outputCol='product_id_indexed')
    labeller = plan_indexer.fit(train)
    Train1 = labeller.transform(train)
    Test1 = labeller.transform(test)
    Train1.show()

    from pyspark.ml.feature import RFormula
    formula = RFormula(
        formula="Purchase ~ Age + Occupation + City_Category + Stay_In_Current_City_Years + Product_Category_1 + Product_Category_2 + Gender",
        featuresCol="features",
        labelCol="label")
    t1 = formula.fit(Train1)
    train1 = t1.transform(Train1)
    test1 = t1.transform(Test1)
    train1.show()
    train1.select('features').show()
    train1.select('label').show()

    from pyspark.ml.regression import RandomForestRegressor
    rf = RandomForestRegressor()
    (train_cv, test_cv) = train1.randomSplit([0.7, 0.3])
    model1 = rf.fit(train_cv)
    predictions = model1.transform(test_cv)

    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    import numpy as np
    print(np.sqrt(mse), mse)  # RMSE and MSE on the hold-out split

    model = rf.fit(train1)
    predictions1 = model.transform(test1)
    df = predictions1.selectExpr("User_ID", "Product_ID", "prediction as Purchase")
    df.toPandas().to_csv('submission.csv', index=False)
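
The function above assumes `train` and `test` already exist; a minimal sketch of how they might be loaded (the file names are assumptions, the source does not show them):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("spark_ml_example").getOrCreate()
train = spark.read.csv('train.csv', header=True, inferSchema=True)
test = spark.read.csv('test.csv', header=True, inferSchema=True)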
Example #3
def main():
    import argparse
    from pyspark.ml.regression import RandomForestRegressor
    from pyspark.ml.evaluation import RegressionEvaluator

    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument(
        '--data',
        type=str,
        default="../../../data/sample_linear_regression_data.txt",
        help='Data location.')
    args = parser.parse_args()

    # Assumes a SparkSession named `spark` exists (see the sketch after this example).
    data = spark.read.format("libsvm").load(args.data)

    # Split the data into training and test sets (30% held out for testing)
    (train, test) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor()

    # Train model.  This also runs the indexer.
    model = rf.fit(train)

    # Make predictions.
    predictions = model.transform(test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")

    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
Example #4
def randomForestRegression(df, arguments):
    from pyspark.ml.regression import RandomForestRegressor
    # Defaults mirror Spark's own; tree-structure parameters are integers.
    maxDepth = 5
    minInstancesPerNode = 1
    numTrees = 20
    impurity = "variance"

    if arguments.maxDepth is not None:
        maxDepth = int(arguments.maxDepth)

    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = int(arguments.minInstancesPerNode)

    if arguments.numTrees is not None:
        numTrees = int(arguments.numTrees)

    if arguments.impurity is not None:
        impurity = arguments.impurity

    rf = RandomForestRegressor(numTrees=numTrees,
                               maxDepth=maxDepth,
                               minInstancesPerNode=minInstancesPerNode,
                               impurity=impurity)
    model = rf.fit(df)

    return model
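
A sketch of the `arguments` namespace the function expects; each value defaults to None, so the guards above keep Spark's defaults unless overridden on the command line:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--maxDepth')
parser.add_argument('--minInstancesPerNode')
parser.add_argument('--numTrees')
parser.add_argument('--impurity')
arguments = parser.parse_args()  # each attribute is None unless passed on the CLI

# model = randomForestRegression(df, arguments)  # `df` must carry features/label columns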
Example #5
def main():
    # Assumes `objective` (see Example #15) and the `train`/`test` DataFrames
    # are defined at module level.
    from skopt import gp_minimize

    # Set bounds for random forest's hyperparameters
    hparams = [(2, 25),   # num_trees
               (2, 6),    # max_depth
               (15, 30)]  # max_bins

    # Run hyperparameter optimization using Gaussian processes
    optim_results = gp_minimize(objective, hparams,
                                n_calls=20, verbose=True, random_state=0)
    
    print('\nHyperparameter Optimization Results:')
    print('Best validation RMSE = {}'.format(optim_results.fun))

    # Get best hyperparameters from optimization
    num_trees = optim_results.x[0]
    max_depth = optim_results.x[1]
    max_bins = optim_results.x[2]
    
    # Instantiate a RandomForest model using best hyperparameter settings
    rf = RandomForestRegressor(numTrees=num_trees, maxDepth=max_depth, maxBins=max_bins)

    # Train model. 
    model = rf.fit(train)

    # Make predictions.
    predictions = model.transform(test)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")

    rmse = evaluator.evaluate(predictions)
    
    print('\nFinal Results on Test Set with Optimized Hyperparameters:')
    print("Root Mean Squared Error on test set = %g" % rmse)
Example #6
    def _getBasePredictor(self, randomSeed):
        # Method excerpt from a larger class; assumes numpy (np), smac's
        # Scenario and SMAC, and RandomForestRegressor are imported at module level.
        # Empty the file that the SMAC evaluations (self._baseEval) write to.
        f = open(self._baseDataPath, "w")
        f.truncate()
        f.close()
        self._lowTrainData = self._trainData.sample(fraction=self._lowRatio,
                                                    seed=randomSeed).cache()
        self._midTrainData = self._trainData.sample(fraction=self._midRatio,
                                                    seed=randomSeed).cache()
        cs = self.getPCS()
        scenario = Scenario({
            "run_obj": "quality",
            "runcount-limit": self._BPDS,
            "cs": cs,
            "deterministic": "true"
        })
        # Optimize, using a SMAC object
        smac = SMAC(scenario=scenario,
                    rng=np.random.RandomState(42),
                    tae_runner=self._baseEval)
        smac.optimize()
        df = self._spark.read.format("libsvm").load(self._baseDataPath)
        rf = RandomForestRegressor()
        rfModel = rf.fit(df)
        self._lowTrainData.unpersist()
        self._midTrainData.unpersist()
        return rfModel
Example #7
def model_dev_rf(df_train, df_test, n_trees, max_bins, max_depth):
    # Assumes `time` (time.time), pandas as `pd`, RandomForestRegressor, and
    # RegressionEvaluator are imported at module level.

    rf_start_time = time()

    # Create an Initial Model Instance
    mod_rf = RandomForestRegressor(labelCol='label',
                                   featuresCol='features',
                                   impurity='variance',
                                   featureSubsetStrategy='all',
                                   numTrees=n_trees,
                                   maxBins=max_bins,
                                   maxDepth=max_depth)

    # Training The Model
    rf_final_model = mod_rf.fit(df_train)

    # Scoring The Model On Test Sample
    rf_transformed = rf_final_model.transform(df_test)
    rf_test_results = rf_transformed.select(['prediction', 'label'])

    # Collecting The Model Statistics
    rf_evaluator = RegressionEvaluator(predictionCol="prediction",
                                       labelCol="label")
    rf_r2 = round(
        rf_evaluator.evaluate(rf_test_results,
                              {rf_evaluator.metricName: "r2"}), 3)
    rf_mse = round(
        rf_evaluator.evaluate(rf_test_results,
                              {rf_evaluator.metricName: "mse"}), 3)
    rf_rmse = round(
        rf_evaluator.evaluate(rf_test_results,
                              {rf_evaluator.metricName: "rmse"}), 3)
    rf_mae = round(
        rf_evaluator.evaluate(rf_test_results,
                              {rf_evaluator.metricName: "mae"}), 3)

    # Printing The Model Statistics
    print("\n++++++ Printing Random Forest Model Accuracy ++++++\n")
    print("R Square: " + str(rf_r2 * 100) + "%")
    print("Mean Squared Error: " + str(rf_mse))
    print("Root Mean Squared Error: " + str(rf_rmse))
    print("Mean Absolute Error: " + str(rf_mae))

    rf_end_time = time()
    rf_elapsed_time = (rf_end_time - rf_start_time) / 60
    rf_model_stat = pd.DataFrame({
        "Model Name": ["Random Forest"],
        "R Square": rf_r2,
        "Mean Squared Error": rf_mse,
        "Root Mean Squared Error": rf_rmse,
        "Mean Absolute Error": rf_mae,
        "Time (Min.)": round(rf_elapsed_time, 3)
    })
    rf_output = (rf_final_model, rf_model_stat)

    return rf_output
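
A hypothetical invocation, assuming `df_train` and `df_test` already carry assembled 'features' and 'label' columns:

rf_model, rf_stats = model_dev_rf(df_train, df_test,
                                  n_trees=50, max_bins=32, max_depth=8)
print(rf_stats)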
Example #8
def testRegression(train, test):
    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    # RegressionMetrics comes from pyspark.mllib.evaluation and expects an RDD,
    # hence the .rdd conversion below (DataFrames have no .map in Spark 2+).

    rf = RandomForestRegressor(labelCol="indexedLabel", numTrees=3, maxDepth=4)

    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
Example #10
def RF(trainingData, testData):
    """
    Random Forest Tree Regression Model
    :param trainingData:
    :param testData:
    :return: Trained model, predictions, nt (int), md (int)
    """
    nt, md = 120, 20
    rf = RandomForestRegressor(numTrees=nt, featureSubsetStrategy="auto",
                               impurity='variance', maxDepth=md, maxBins=100)
    model = rf.fit(trainingData)
    predictions = model.transform(testData)
    return model, predictions, nt, md
Example #11
def RF(trainingData, testData, args):
    """
    Random Forest Tree Regression Model
    :param trainingData:
    :param testData:
    :param args
    :return: Trained model, predictions, nt (int), md (int)
    """
    if (args.descriptor == 'CM' or args.descriptor == 'CMSE'
            or args.descriptor == 'Morgan2DCMSE'):
        nt, md = 50, 14
    elif (args.descriptor == 'Morgan2D' or args.descriptor == 'Morgan2DSE'
          or args.descriptor == 'Morgan2DSEext'):
        nt, md = 120, 20
    else:
        # Assumed fallback so nt/md are always defined for unknown descriptors.
        nt, md = 120, 20
    rf = RandomForestRegressor(numTrees=nt, featureSubsetStrategy="auto",
                               impurity='variance', maxDepth=md, maxBins=100)
    model = rf.fit(trainingData)
    predictions = model.transform(testData)
    return model, predictions, nt, md
Example #12
def rfRegressor(df):
    # Assumes Vectors (pyspark.ml.linalg), udf (pyspark.sql.functions),
    # RandomForestRegressionModel, and importance_features_map (see Example #25)
    # are available at module level.
    # Move 'price' to the last column so it can be split off as the label.
    df = df.withColumn('tmp_price', df['price'])
    df = df.drop('price')
    df = df.withColumnRenamed('tmp_price', 'price')

    feature_label = df.rdd.map(lambda x: (Vectors.dense(
        [float(i) for i in x[0:-1]]), float(x[-1]))).toDF(
            ["features", "label"])

    (trainingData, testData) = feature_label.randomSplit([0.7, 0.3])

    rf = RandomForestRegressor()

    model = rf.fit(trainingData)

    importance_map_df = importance_features_map(df, model, 'price')

    # Make predictions.
    predictions = model.transform(testData)
    predict_df = predictions.select("prediction", "label")
    predict_df = predict_df.withColumn(
        'rate',
        (predict_df['prediction'] - predict_df['label']) / predict_df['label'])

    def abs_rate(s):
        return round(abs(s), 3)

    # Wrap as a Spark UDF (returns StringType by default, fine for display).
    udf_rate = udf(abs_rate)

    predict_df = predict_df.select(
        '*',
        udf_rate(predict_df['rate']).alias('rates')).drop('rate')

    predict_df.show()

    model.save("/root/myModelPath1")
    sameModel = RandomForestRegressionModel.load("/root/myModelPath1")

    same_predict_df = sameModel.transform(testData)
    print('=======================================')
    same_predict_df.show()

    return importance_map_df, model
Example #13
    def featureAnalysis(self, etlStats, algoName):

        numericalFeatures = etlStats.get(PredictiveConstants.NUMERICALFEATURES)
        label = etlStats.get(PredictiveConstants.LABELCOLM)
        dataset = etlStats.get(PredictiveConstants.DATASET)
        featuresColm = etlStats.get(PredictiveConstants.FEATURESCOLM)
        indexedFeatures = etlStats.get(PredictiveConstants.INDEXEDFEATURES)
        maxCategories = etlStats.get(PredictiveConstants.MAXCATEGORIES)
        categoricalFeatures = etlStats.get(
            PredictiveConstants.CATEGORICALFEATURES)

        trainData, testData = dataset.randomSplit([0.80, 0.20], seed=40)

        keyStatsTest = ''
        statisticalTestResult = {}
        randomForestModel = None
        if algoName == PredictiveConstants.RANDOMREGRESSOR:
            statisticalTestObj = PredictiveStatisticalTest(
                dataset=dataset, features=numericalFeatures, labelColm=label)
            statisticalTestResult = statisticalTestObj.pearsonTest()
            randomForestModel = \
                RandomForestRegressor(labelCol=label,
                                      featuresCol=featuresColm,
                                      numTrees=10)
            keyStatsTest = "pearson_test_data"
        elif algoName == PredictiveConstants.RANDOMCLASSIFIER:
            statisticalTestObj = PredictiveStatisticalTest(
                dataset=dataset, features=indexedFeatures, labelColm=label)
            statisticalTestResult = \
                statisticalTestObj.chiSquareTest(categoricalFeatures=categoricalFeatures,
                                                 maxCategories=maxCategories)
            randomForestModel = RandomForestClassifier(
                labelCol=label, featuresCol=featuresColm, numTrees=10)
            keyStatsTest = "ChiSquareTestData"
        randomForestModelFit = randomForestModel.fit(trainData)

        featureAnalysis = {
            PredictiveConstants.RANDOMFORESTMODEL: randomForestModelFit,
            PredictiveConstants.KEYSTATSTEST: keyStatsTest,
            PredictiveConstants.STATISTICALTESTRESULT: statisticalTestResult
        }
        return featureAnalysis
Example #14
    def randomForestRegressorModel(self):
        randomForestRegressorModelFit = \
            RandomForestRegressor(labelCol=self.labelColm,
                                  featuresCol=self.featuresColm,
                                  numTrees=10, predictionCol=self.modelSheetName)
        regressor = randomForestRegressorModelFit.fit(self.trainData)
        # predictionData = regressor.transform(self.testData)

        regressionStat = self.randomGradientRegressionModelEvaluation(regressor=regressor)

        # persisting model
        modelName = "randomForestModel"
        extension = ".parquet"
        modelStorageLocation = self.locationAddress + self.userId.upper() + modelName.upper() + extension
        regressor.write().overwrite().save(modelStorageLocation)

        regressionStat["modelPersistLocation"] = {"modelName": modelName,
                                                  "modelStorageLocation": modelStorageLocation}

        return regressionStat
Example #15
def objective(hparams):
    """
    Objective function to be minimized:
    Model validation RMSE loss as a function of our model hyperparameters.

    Parameters:
    ----------
    * `hparams` [list]
        Hyperparameter settings determined by Bayesian optimization loop.

    Returns:
    -------
    * `rmse` [float]
        Root mean squared error on the validation set    

    Reference:
    ---------
    Bayesian optimization with Scikit-Optimize:
    https://scikit-optimize.github.io/
    """
    # `train` and `val` are assumed to be module-level DataFrames prepared beforehand.
    # New hyperparameter settings from Bayesian optimization
    num_trees, max_depth, max_bins = hparams

    # Instantiate a RandomForest model.
    rf = RandomForestRegressor(numTrees=num_trees, maxDepth=max_depth, maxBins=max_bins)

    # Train model.
    model = rf.fit(train)

    # Make predictions.
    predictions = model.transform(val)

    # Select (prediction, true label) and compute the validation error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")

    rmse = evaluator.evaluate(predictions)
    return rmse
Example #16
def build_random_forest_regressor_model(observation_df, feature_columns):
    # Create new column with all of the features
    vector_observation_df = create_feature_column(observation_df,
                                                  feature_columns,
                                                  ['features', 'duration_sec'])

    train_df, test_df = vector_observation_df.randomSplit([0.7, 0.3])

    rf = RandomForestRegressor(featuresCol='features', labelCol='duration_sec')
    rfr_model = rf.fit(train_df)

    test_predictions = rfr_model.transform(test_df)
    test_predictions.select("prediction", "duration_sec", "features").show(5)

    # "accuracy" is a classification metric; for a regressor, measure RMSE instead.
    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol='duration_sec',
                                    metricName='rmse')
    print("RMSE on test data = %g" % evaluator.evaluate(test_predictions))

    # test_result = rfr_model.evaluate(test_df)

    return rfr_model
Example #17
def random_forest_regressor_example():
    # Renamed from `RandomForestRegressor`: the original name shadowed the
    # estimator class used below, so the call inside would have recursed.
    from numpy import allclose
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.regression import RandomForestRegressor, RandomForestRegressionModel
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
    model = rf.fit(df)
    model.featureImportances
    # SparseVector(1, {0: 1.0})
    allclose(model.treeWeights, [1.0, 1.0])
    # True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    model.transform(test0).head().prediction
    # 0.0
    model.numFeatures
    # 1
    model.trees
    # [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...]
    model.getNumTrees
    # 2
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                                  ["features"])
    model.transform(test1).head().prediction
    # 0.5
    temp_path = "./"
    rfr_path = temp_path + "/rfr"
    rf.save(rfr_path)
    rf2 = RandomForestRegressor.load(rfr_path)
    rf2.getNumTrees()
    # 2
    model_path = temp_path + "/rfr_model"
    model.save(model_path)
    model2 = RandomForestRegressionModel.load(model_path)
    model.featureImportances == model2.featureImportances
Example #18
    def randomForestRegression(self, regressionInfo):
        etlStats = self.etlOperation(etlInfo=regressionInfo)

        featuresColm = etlStats.get(PredictiveConstants.FEATURESCOLM)
        modelName = regressionInfo.get(PredictiveConstants.MODELSHEETNAME)
        labelColm = etlStats.get(PredictiveConstants.LABELCOLM)
        trainData = etlStats.get(PredictiveConstants.TRAINDATA)
        locationAddress = regressionInfo.get(
            PredictiveConstants.LOCATIONADDRESS)
        modelId = regressionInfo.get(PredictiveConstants.MODELID)

        randomForestRegressorModelFit = \
            RandomForestRegressor(labelCol=labelColm,
                                  featuresCol=featuresColm,
                                  numTrees=10, predictionCol=modelName)
        regressor = randomForestRegressorModelFit.fit(trainData)
        # predictionData = regressor.transform(self.testData)

        regressionStat = self.regressionEvaluation(
            regressor=regressor,
            regressionInfo=regressionInfo,
            etlStats=etlStats)

        # persisting model
        modelNameLocal = "randomForestModel"
        extension = ".parquet"
        modelStorageLocation = (locationAddress + modelId.upper() +
                                modelNameLocal.upper() + extension)
        regressor.write().overwrite().save(modelStorageLocation)

        regressionStat["modelPersistLocation"] = {
            "modelName": modelNameLocal,
            "modelStorageLocation": modelStorageLocation
        }

        return regressionStat
Example #19
# Assumes `df`, a VectorAssembler named `vecAssembler` (see the sketch after this
# example), `datetime`, and the pyspark.ml regression/evaluation imports are in scope.
df = df.selectExpr("fare_amount as label", 'pickup_longitude',
                   'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
                   'passenger_count')

new_df = vecAssembler.setHandleInvalid("skip").transform(df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = new_df.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestRegressor()

# Train model.
start_time = datetime.now()

model = rf.fit(trainingData)

time_elapsed = datetime.now() - start_time
print('TIME OF RANDOM FOREST TRAINING (hh:mm:ss.ms) {}'.format(time_elapsed))

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
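
A sketch of the `vecAssembler` the snippet assumes, built from the columns it selects above:

from pyspark.ml.feature import VectorAssembler

vecAssembler = VectorAssembler(
    inputCols=['pickup_longitude', 'pickup_latitude',
               'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
    outputCol='features')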
Example #20
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list,
                     relation):
    try:
        # dataset = spark.read.parquet(dataset_add)
        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=';')

        dataset.show()

        # Take the (single) label column name from its list.
        label = label_colm[-1]
        print(label)

        # extracting the schema

        schemaDataset = dataset.schema

        stringFeatures = []
        numericalFeatures = []

        for x in schemaDataset:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        print(stringFeatures)
        print(numericalFeatures)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListTemp.append(summ)
            varianceListTemp = list(
                dataset.select(variance(
                    col(colm)).alias(colm)).toPandas()[colm])
            summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        print(summaryDict)

        # print(val)

        # For a linear relation the dataset is used as-is.
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        # calling pearson test fuction

        response_pearson_test = Correlation_test_imp(
            dataset=dataset, features=numericalFeatures, label_col=label)

        # dataset = dataset.withColumnRenamed(label , 'indexed_'+ label)

        # dataset_pearson = dataset

        #
        # label_indexer = StringIndexer(inputCol=label, outputCol='indexed_'+label).fit(dataset)
        # dataset = label_indexer.transform(dataset)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+colm], outputCols=['encoded_'+colm]).fit(dataset)
            # encoded_features.append('encoded_'+colm)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both the features colm together

        final_features = numericalFeatures + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()

        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        #using the vector indexer

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        finalized_data.show()

        # splitting the dataset into training and testing

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        rf = RandomForestRegressor(labelCol=label,
                                   featuresCol='vec_indexed_features',
                                   numTrees=10)

        # Train model.  This also runs the indexers.
        model = rf.fit(train_data)

        # Make predictions.
        predictions = model.transform(test_data)

        # Select example rows to display.
        # predictions.select("prediction", "label", "features").show(10)

        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)

        features_column_for_user = numericalFeatures + stringFeatures

        feature_imp = {
            'feature_importance': feature_importance,
            "feature_column": features_column_for_user
        }

        response_dict = {
            'feature_importance': feature_imp,
            'pearson_test_data': response_pearson_test,
            'summaryDict': summaryDict
        }

        return response_dict

        # Select (prediction, true label) and compute test error
        # evaluator = MulticlassClassificationEvaluator(
        #     labelCol="label", predictionCol="prediction", metricName="accuracy")
        # accuracy = evaluator.evaluate(predictions)
        # print("Test Error = %g" % (1.0 - accuracy))

        # rfModel = model.stages[2]
        # print(rfModel)  # summary only

    except Exception as e:
        print("exception is = " + str(e))
Example #21
###

# Assumes `df` already holds a "tfidf" feature column and an "overall" label column.
from pyspark.ml.feature import PCA
print("pca")
df = PCA(k=300, inputCol="tfidf",
         outputCol="pca").fit(df).transform(df).select("pca", "overall")
df.show()
#df.show(truncate=False)

###

from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(numTrees=50,
                           maxDepth=5,
                           seed=42,
                           labelCol='overall',
                           featuresCol='pca',
                           predictionCol='prediction')
model = rf.fit(df)
pred = model.transform(df)

pred.show()

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="overall", predictionCol="prediction")
print("r2", evaluator.evaluate(pred, {evaluator.metricName: "r2"}))
print("mse", evaluator.evaluate(pred, {evaluator.metricName: "mse"}))
Example #22
    def _train_model_spark(self, data):
        df = self._prepare_data_spark(data)
        input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                                self.TODAY_PRICE}))

        if self.ann_hidden_nodes_num is None:
            # Integer division so the layer size stays an int on Python 3.
            self.ann_hidden_nodes_num = input_num // 2 + 1
        ann_layers = [input_num,
                      # input_num / 3 * 2,
                      # input_num / 3,
                      self.ann_hidden_nodes_num,
                      2]

        self.logger.info('layer settings are {}'.format(ann_layers))
        self.logger.info('training method is {}'.format(self._train_method))
        self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
        if isinstance(self._train_method, dict):
            if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                self._model[self.CHANGE_AMOUNT].stop_server()
            self._model = {self.CHANGE_AMOUNT: None,
                           self.CHANGE_DIRECTION: None}

            if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                      maxIter=self.linear_regression_training_times,
                                      regParam=self.linear_regression_regularization_parameter,
                                      predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = lr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth,
                                            predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                          num_workers=self.spark_worker_numbers,
                                                                          epoch=self.ann_epoch_number,
                                                                          featuresCol="features",
                                                                          labelCol=self.CHANGE_AMOUNT,
                                                                          predictionCol='AmountPrediction'
                                                                          )
                self._model[self.CHANGE_AMOUNT].fit(df)
            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

            if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
                lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                        maxIter=self.logistic_regression_training_times,
                                        regParam=self.linear_regression_regularization_parameter,
                                        predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = lr.fit(df)
            elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
                rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                             numTrees=self.random_forest_tree_number,
                                             maxDepth=self.random_forest_tree_max_depth,
                                             predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = rfc.fit(df)

            elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 2
                mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                      labelCol=self.CHANGE_DIRECTION,
                                                      layers=ann_layers,
                                                      predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        else:
            if self._train_method == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                      regParam=self.linear_regression_regularization_parameter,
                                      maxIter=self.linear_regression_training_times)
                self._model = lr.fit(df)
            elif self._train_method == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                            predictionCol='prediction',
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth)
                self._model = rfr.fit(df)

            elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                if self._model is not None:
                    self._model.stop_server()
                self.logger.warn('layers are {}'.format(ann_layers))
                self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                      num_workers=self.spark_worker_numbers, epoch=100,
                                                      featuresCol="features", labelCol=self.TARGET_PRICE,
                                                      predictionCol='prediction'
                                                      )
                self._model.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        return self._model
run.log("Max Bins", maxBins)
run.log("Number of Trees", numTrees)
run.log('Subsampling Rate', subsamplingRate)
run.log_list("Feature Columns", feature_cols)

###############
# TRAIN MODEL #
###############

print("  * Training {0} model".format(model_name))
# Instantiate New RandomForestRegressor Object
rf = RandomForestRegressor(labelCol='duration_minutes', maxDepth=maxDepth, maxBins=maxBins, impurity='variance', 
                           subsamplingRate=1.0, seed=random_seed, numTrees=numTrees, featureSubsetStrategy='auto')

# Train model on transformed training data
rf_model = rf.fit(trainDF_transformed)

rf_full_model = feature_model.copy()
rf_full_model.stages.append(rf_model)

print("  * Model trained, scoring validation data")
# Run the full model (feature steps and trained model)
validation_scored = rf_full_model.transform(validDF)

#####################
# MODEL PERFORMANCE #
#####################

print("  * Calculating performance metrics")
# Calculate Regression Performance
rmse = evaluator.evaluate(validation_scored, {evaluator.metricName: "rmse"})
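
The `evaluator` used above is defined elsewhere in the script; a sketch consistent with the label column it trains on:

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol='duration_minutes',
                                predictionCol='prediction')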
Example #24
    #Drop nulls
    modelDF = assembler.transform(df7.dropna())
    #Cast label to double for regression model
    modelDF = modelDF.withColumn("label", modelDF["count"].cast("double"))

    #Split train and test set
    (train, test) = modelDF.randomSplit([0.8, 0.2])

    #Build Random Forest Model
    rf_mod = RandomForestRegressor(featuresCol="features",
                                   labelCol="label",
                                   numTrees=100,
                                   maxDepth=4,
                                   maxBins=40)
    fitted = rf_mod.fit(train)

    #Get predictions for the test set and round them to integers because the label is a count.
    #Note: `round` here must be pyspark.sql.functions.round, not the Python builtin.
    predictions = fitted.transform(test)
    predictions = predictions.withColumn("predictions",
                                         round(predictions.prediction, 0))
    evaluator = RegressionEvaluator(predictionCol="predictions",
                                    labelCol="label",
                                    metricName="r2")

    pred = predictions.select("label", "predictions", "features").toPandas()
    #Save predictions
    pred.to_csv('gupta_3_predictions.csv', index=False)
    with open('gupta_3.txt', 'w') as output:
        output.write("Test R-Squared = " +
                     str(evaluator.evaluate(predictions)))
Example #25
def importance_features_map(df, model, label):
    # Reconstructed head (the source truncated this helper): pair each feature
    # column with the trained model's importances. Also used by Example #12.
    import numpy as np
    import pandas as pd
    x_columns = [c for c in df.columns if c != label]
    arr = np.array([x_columns, list(model.featureImportances.toArray())])
    importance_map_df = pd.DataFrame(
        arr.T, columns=['X_columns', 'importances_values'])

    return importance_map_df


start = time.time()
parquet_path = '/user/limeng/data/ganji_daxing.parquet'
df, columns_list = read_parquet(parquet_path)

print('=====================')
df.show()

(trainingData, testData) = df.randomSplit([0.7, 0.3])
rf = RandomForestRegressor(numTrees=20, maxDepth=15, impurity="variance")
print('model_train_start======================')
model = rf.fit(trainingData)
model.save('/user/limeng/data/ganji_daxing_RF_model')

#model = RandomForestRegressionModel.load('/user/limeng/data/fangtianxia_daxing_RF_model')
predict_value = model.transform(testData)
print('predict==============')
predict_value.show(truncate=False)

predict_value_rate = predict_value.rdd.map(
    lambda x: (x[1], x[2], abs(x[1] - x[2]) / x[1])).toDF(
        ['label', 'prediction', 'residual_rate'])
print('predict_value_rate-----------------------------')
predict_value_rate = predict_value_rate.sort("residual_rate", ascending=False)

predict_value_rate.write.mode("overwrite").options(
    header="true").csv('/user/limeng/fangtianxia_daxing_predict_result')
Example #26
# Decision Tree: sweep maxDepth over 3..30 (kept as an int, not a float)
r2_dtr = np.zeros(10)
for i in np.arange(10):
    dtr = DecisionTreeRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3)
    dtrModel = dtr.fit(sample)
    prediction_dtr = dtrModel.transform(sample)
    r2_dtr[i] = evaluator.evaluate(prediction_dtr)
plt.plot(np.arange(3, 33, 3), r2_dtr)
# so choose 10 as the maxDepth

# In[108]:

# Random Forest
r2_rfr = np.zeros(10)
for i in np.arange(10):
    rfr = RandomForestRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3)
    rfrModel = rfr.fit(sample)
    prediction_rfr = rfrModel.transform(sample)
    r2_rfr[i] = evaluator.evaluate(prediction_rfr)
plt.plot(np.arange(3, 33, 3), r2_rfr)
# so select 10 as maxDepth

# In[109]:

# Gradient Boosted Trees
r2_gbt = np.zeros(10)
for i in np.arange(10):
    gbt = GBTRegressor(labelCol='mean_temp', maxIter=(i + 1) * 10)
    gbtModel = gbt.fit(sample)
    prediction_gbt = gbtModel.transform(sample)
    r2_gbt[i] = evaluator.evaluate(prediction_gbt)
plt.plot(np.arange(10, 105, 10), r2_gbt)

Example #27
# In[160]:


pred_list = []
for i in range(pred_num_period):
    va = VectorAssembler(outputCol='features', inputCols=train_spark_df.columns[:-pred_num_period])
    label_col = 'pred_period_%d'%i
    train_va = va.transform(train_spark_df).select('features', label_col).withColumnRenamed(label_col, 'label').cache()
    val_va = va.transform(val_spark_df).select('features', label_col).withColumnRenamed(label_col, 'label').cache()

    train_va.count(); val_va.count();

    rf = RandomForestRegressor(maxDepth=10, numTrees=10, maxBins=128)
    rfmodel = rf.fit(train_va)

    pred_val = rfmodel.transform(val_va)
    pred_list.append(pred_val.select('prediction').rdd.map(lambda x: x[0]).collect())
    evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName="rmse")
    accuracy = evaluator.evaluate(pred_val)
    print('RMSE for period %d: %.4f' % (i + 1, accuracy))


# In[161]:


pred = np.stack(pred_list, axis=1)

sc.stop()
Example #28
final_data.head(1)

# # split train/test

# In[ ]:

train_data, test_data = final_data.randomSplit([0.7, 0.3])

# # Model training

# In[ ]:

from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(numTrees=100)
model = rf.fit(train_data)

# # model evaluation

# In[ ]:

model.featureImportances

# In[ ]:

from pyspark.ml.evaluation import RegressionEvaluator

# In[ ]:

test_results = model.transform(test_data)
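
# In[ ]:

The notebook imports RegressionEvaluator but stops before computing a metric; a sketch of the missing cell, assuming the default 'label' column name:

evaluator = RegressionEvaluator(labelCol='label',
                                predictionCol='prediction',
                                metricName='rmse')
print("RMSE:", evaluator.evaluate(test_results))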
Example #29
# Assumed imports for this script (it was excerpted without its header):
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

sparkConf = SparkConf().setAppName("Machine Learning").setMaster("local[*]")
sc = SparkContext(conf=sparkConf)
spark = SparkSession.builder.appName("Machine Learning SQL").getOrCreate()
sc.setLogLevel("ERROR")

df = spark.read.format("csv").option("header", "true").option(
    "inferSchema", "true").csv("realestate.csv")
df.printSchema()
print("--------")

df = df.na.fill(0, df.columns[1:])

dfR = df.drop("transactiondate").withColumnRenamed("logerror", "label")

vecAssembler = VectorAssembler(inputCols=dfR.columns[1:-1],
                               outputCol="features")
dfWithFeatures = vecAssembler.transform(dfR)

(trainingData, testData) = dfWithFeatures.randomSplit([0.7, 0.3])

trainingData.show()

rf = RandomForestRegressor(featuresCol="features",
                           labelCol="label",
                           predictionCol="prediction")
model = rf.fit(trainingData)

predictionsDF = model.transform(testData)

predictionsDF.drop("features").write.option("header", "true").csv("test.csv")
Example #30
# The assembler's input-column list was truncated in the source; the names below
# are assumptions based on the standard Ecommerce Customers tutorial dataset.
assembler = VectorAssembler(inputCols=['Avg Session Length', 'Time on App',
                                       'Time on Website', 'Length of Membership'],
                            outputCol='Features')
transformedEcommerceData = assembler.transform(ecommerceData)
transformedEcommerceData.show()

# %%
#Preparing the data for the model by only having two columns: the features and the column of known data we're trying to predict
finalData = transformedEcommerceData.select('Features', 'Yearly Amount Spent')
finalData.show()

# %%
#Splitting the data into training and testing sets by randomly choosing 70% of the rows for training and 30% of the rows for testing
trainingData, testingData = finalData.randomSplit([0.7, 0.3])

# %%
#Random Forest Regression
randomForest = RandomForestRegressor(featuresCol="Features",
                                     labelCol="Yearly Amount Spent",
                                     maxDepth=15,
                                     maxBins=32,
                                     numTrees=200)
randomForestModel = randomForest.fit(trainingData)
rfresults = randomForestModel.transform(testingData)
rfresults.select("Prediction", "Yearly Amount Spent", "Features")
rfresults.show()
#Using RMSE to evaluate the model
gbtevaluator = RegressionEvaluator(labelCol="Yearly Amount Spent",
                                   predictionCol="prediction",
                                   metricName="rmse")
gbtrmse = gbtevaluator.evaluate(rfresults)
print("Gradient-Boosted Tree RMSE: ", gbtrmse)
Example #31
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

random_forest = RandomForestRegressor()

param_Grd = (ParamGridBuilder().addGrid(
    random_forest.maxDepth,
    [2, 4, 6, 8]).addGrid(random_forest.maxBins,
                          [20, 60]).addGrid(random_forest.numTrees,
                                            [5, 20, 50, 100]).build())

t_v_s = TrainValidationSplit(estimator=random_forest,
                             estimatorParamMaps=param_Grd,
                             evaluator=RegressionEvaluator(),
                             trainRatio=0.8)

# Fit via the TrainValidationSplit so the grid is actually searched; the original
# called random_forest.fit directly, which ignored the tuning setup above.
rfModel = t_v_s.fit(trainingData)

# COMMAND ----------

pred = rfModel.transform(trainingData)
select_cols = ["label", "prediction", "time_window"]
pred = pred.select(select_cols)
display(pred)

# COMMAND ----------

import numpy as np


def Mean_Absolute_Percentage_Error(label, prediction):
    label, prediction = np.array(label), np.array(prediction)
    # The source breaks off here; the standard MAPE formula is the natural completion.
    return np.mean(np.abs((label - prediction) / label)) * 100
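
A hedged usage sketch, assuming the `pred` DataFrame from the tuning block above still carries `label` and `prediction` columns:

rows = pred.select("label", "prediction").collect()
mape = Mean_Absolute_Percentage_Error([r["label"] for r in rows],
                                      [r["prediction"] for r in rows])
print("MAPE: %.2f%%" % mape)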
Example #32
# List comprehensions instead of map(): on Python 3, map() returns an iterator,
# so the original `map(...) + map(...)` concatenation would raise a TypeError.
assemblerInputs = ([c + "classVec" for c in categoricalColumns] +
                   [c + "classVec" for c in encColumns] + numericCols)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)

from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(numTrees=4,
                           featuresCol="features",
                           labelCol='total_amount',
                           maxDepth=2,
                           seed=42)
rf_model = rf.fit(dataset)
rf_model.write().overwrite().save("./nyc-01020304-6vm-18-RF-model")

import sys
sys.exit(0)
"""
from pyspark.ml.feature import VectorAssembler
#vectorAssembler = VectorAssembler(inputCols = ['key', 'passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'fare_amount')
#newDF_test1=df_test1.withColumn('Travel_Distance',fun_dist_udf(df_test1["pickup_latitude"],df_test1["pickup_longitude"],df_test1["dropoff_latitude"],df_test1["dropoff_longitude"]))
#vectorAssembler = VectorAssembler(inputCols = ['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vectorAssembler = VectorAssembler(inputCols = ['passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(train_X4)
vhouse_df = vhouse_df.select(['features', 'fare_amount'])
vhouse_df.show(3)

from pyspark.ml.regression import RandomForestRegressor
Example #33
stages += [encoder]
#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount", "tip_amount"]
assemblerInputs = ([c + "classVec" for c in categoricalColumns] +
                   [c + "classVec" for c in encColumns] + numericCols)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)

from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(numTrees=4,featuresCol="features",labelCol='total_amount', maxDepth=2, seed=42)
rf_model = rf.fit(dataset)
rf_model.write().overwrite().save("./nyc-01020304-6vm-18-RF-model")


import sys
sys.exit(0)
"""
from pyspark.ml.feature import VectorAssembler
#vectorAssembler = VectorAssembler(inputCols = ['key', 'passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'fare_amount')
#newDF_test1=df_test1.withColumn('Travel_Distance',fun_dist_udf(df_test1["pickup_latitude"],df_test1["pickup_longitude"],df_test1["dropoff_latitude"],df_test1["dropoff_longitude"]))
#vectorAssembler = VectorAssembler(inputCols = ['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vectorAssembler = VectorAssembler(inputCols = ['passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(train_X4)
vhouse_df = vhouse_df.select(['features', 'fare_amount'])
vhouse_df.show(3)
Example #34
# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)


# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity")
pipeline = Pipeline().setStages([glr])
params = ParamGridBuilder().addGrid(glr.regParam, [0, 0.5, 1]).build()
evaluator = RegressionEvaluator()\