Example #1
def randomForestRegression(df, arguments):
	from pyspark.ml.regression import RandomForestRegressor
	maxDepth = 5
	minInstancesPerNode = 1
	numTrees = 20
	impurity = "variance"

	if arguments.maxDepth is not None:
		maxDepth = int(arguments.maxDepth)

	if arguments.minInstancesPerNode is not None:
		minInstancesPerNode = int(arguments.minInstancesPerNode)

	if arguments.numTrees is not None:
		numTrees = int(arguments.numTrees)

	if arguments.impurity is not None:
		impurity = arguments.impurity

	rf = RandomForestRegressor(numTrees=numTrees,
	                           maxDepth=maxDepth,
	                           minInstancesPerNode=minInstancesPerNode,
	                           impurity=impurity)
	model = rf.fit(df)

	return model
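A minimal sketch of how this helper might be driven from the command line; the flag names mirror the attributes the function reads, while the SparkSession setup and the input path are assumptions, not part of the original:

from pyspark.sql import SparkSession
import argparse

spark = SparkSession.builder.getOrCreate()

# All flags default to None, so the function falls back to its built-in
# defaults whenever a flag is omitted.
parser = argparse.ArgumentParser()
parser.add_argument('--maxDepth')
parser.add_argument('--minInstancesPerNode')
parser.add_argument('--numTrees')
parser.add_argument('--impurity')
arguments = parser.parse_args()

# Hypothetical input: any libsvm file with label/features columns works.
df = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
model = randomForestRegression(df, arguments)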
Example #2
 def _getBasePredictor(self, randomSeed):
     # Opening the file in "w" mode truncates it, clearing stale training rows.
     open(self._baseDataPath, "w").close()
     self._lowTrainData = self._trainData.sample(fraction=self._lowRatio,
                                                 seed=randomSeed).cache()
     self._midTrainData = self._trainData.sample(fraction=self._midRatio,
                                                 seed=randomSeed).cache()
     cs = self.getPCS()
     scenario = Scenario({
         "run_obj": "quality",
         "runcount-limit": self._BPDS,
         "cs": cs,
         "deterministic": "true"
     })
     # Optimize, using a SMAC-object
     smac = SMAC(scenario=scenario,
                 rng=np.random.RandomState(42),
                 tae_runner=self._baseEval)
     smac.optimize()
     df = self._spark.read.format("libsvm").load(self._baseDataPath)
     rf = RandomForestRegressor()
     rfModel = rf.fit(df)
     self._lowTrainData.unpersist()
     self._midTrainData.unpersist()
     return rfModel
Example #3
def main():
    # Set bounds for random forest's hyperparameters
    hparams = [(2, 25),   # num_trees
               (2, 6),    # max_depth
               (15, 30)]  # max_bins
    
    # Run hyperparameter optimization using Gaussian processes
    optim_results = gp_minimize(objective, hparams, n_calls=20, verbose=True, random_state=0)
    
    print('\nHyperparameter Optimization Results:')
    print('Best validation RMSE = {}'.format(optim_results.fun))

    # Get best hyperparameters from optimization
    num_trees = optim_results.x[0]
    max_depth = optim_results.x[1]
    max_bins = optim_results.x[2]
    
    # Instantiate a RandomForest model using best hyperparameter settings
    rf = RandomForestRegressor(numTrees=num_trees, maxDepth=max_depth, maxBins=max_bins)

    # Train model. 
    model = rf.fit(train)

    # Make predictions.
    predictions = model.transform(test)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")

    rmse = evaluator.evaluate(predictions)
    
    print('\nFinal Results on Test Set with Optimized Hyperparameters:')
    print("Root Mean Squared Error on test set = %g" % rmse)
Example #4
def main():
    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument(
        '--data',
        type=str,
        default="../../../data/sample_linear_regression_data.txt",
        help='Data location.')
    args = parser.parse_args()

    data = spark.read.format("libsvm").load(args.data)

    # Split the data into training and test sets (30% held out for testing)
    (train, test) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor()

    # Train model.  This also runs the indexer.
    model = rf.fit(train)

    # Make predictions.
    predictions = model.transform(test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")

    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
Example #5
def test_multi_target_random_forest():
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor

    X_train, X_test, Y_train, _ = train_test_split(*shap.datasets.linnerud(),
                                                   test_size=0.2,
                                                   random_state=0)
    est = RandomForestRegressor(random_state=202,
                                n_estimators=10,
                                max_depth=10)
    est.fit(X_train, Y_train)
    predicted = est.predict(X_test)

    explainer = shap.TreeExplainer(est)
    expected_values = np.asarray(explainer.expected_value)
    assert len(
        expected_values
    ) == est.n_outputs_, "Length of expected_values doesn't match n_outputs_"
    shap_values = np.asarray(explainer.shap_values(X_test)).reshape(
        est.n_outputs_ * X_test.shape[0], X_test.shape[1])
    phi = np.hstack((shap_values, np.repeat(expected_values,
                                            X_test.shape[0]).reshape(-1, 1)))
    assert np.allclose(phi.sum(1), predicted.flatten(order="F"), atol=1e-4)
Example #6
def spark_ml():
    diff_cat_in_train_test = test.select('Product_ID').subtract(train.select('Product_ID'))
    print(diff_cat_in_train_test.distinct().count())
    
    from pyspark.ml.feature import StringIndexer
    plan_indexer = StringIndexer(inputCol = 'Product_ID', outputCol = 'product_ID')
    labeller = plan_indexer.fit(train)
    Train1 = labeller.transform(train)
    Test1 = labeller.transform(test)
    Train1.show()
    from pyspark.ml.feature import RFormula
    formula = RFormula(formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",featuresCol="features",labelCol="label")
    t1 = formula.fit(Train1)
    train1 = t1.transform(Train1)
    test1 = t1.transform(Test1)
    train1.show()
    train1.select('features').show()
    train1.select('label').show()
    from pyspark.ml.regression import RandomForestRegressor
    rf = RandomForestRegressor()
    (train_cv, test_cv) = train1.randomSplit([0.7, 0.3])
    model1 = rf.fit(train_cv)
    predictions = model1.transform(test_cv)
    from pyspark.ml.evaluation import RegressionEvaluator
    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(predictions,{evaluator.metricName:"mse" })
    import numpy as np
    np.sqrt(mse), mse
    model = rf.fit(train1)
    predictions1 = model.transform(test1)
    df = predictions1.selectExpr("User_ID as User_ID", "Product_ID as Product_ID", 'prediction as Purchase')
    df.toPandas().to_csv('submission.csv')
Example #7
def traintest(data, outputfile):
    print('in traintest  ***********************************')
    print('\n')
    train, test = data.randomSplit([0.7, 0.3])
    train = train.cache()

    test = test.cache()
    # First, cross-validate to find the hyperparameters with the lowest error,
    # then refit on all the training data with those parameters.
    rf_regressor = RandomForestRegressor(featuresCol="features",
                                         labelCol="sale",
                                         seed=40)
    myFeatures = [
        "pack", "bottlessold", "volumesoldl", "volumesoldg", "latitude",
        "longitude"
    ]
    # ,"vendornumber","bottlevolume","statebottlecost"
    assembler = VectorAssembler(inputCols=myFeatures, outputCol="features")
    # "vendornumber","bottlevolume","itemnumber"
    #"statebottlecost",
    pipeline = Pipeline(stages=[assembler, rf_regressor])

    paramGrid = ParamGridBuilder().addGrid(
        rf_regressor.maxDepth,
        [2, 5, 10, 15]).addGrid(rf_regressor.minInfoGain,
                                [0.01]).addGrid(rf_regressor.numTrees,
                                                [20, 30, 100]).build()
    #paramGrid = ParamGridBuilder().addGrid(rf_regressor.maxDepth, [2]).addGrid(rf_regressor.minInfoGain, [0.01]).addGrid(rf_regressor.numTrees, [5]).build()

    # Run cross-validation, and choose the best set of parameters.
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(
                                  predictionCol='prediction',
                                  labelCol='sale',
                                  metricName='rmse'),
                              numFolds=8)
    cvModel = crossval.fit(train)
    #model = pipeline.fit(train)
    predictions = cvModel.transform(test)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='sale',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='sale',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)

    print('r2 =', r2)
    print('rmse =', rmse)
    output = open(outputfile, "w")
    output.write("r2=" + str(r2) + "\n")
    output.write("rmse=" + str(rmse) + "\n")
Example #8
def model_dev_rf(df_train, df_test, n_trees, max_bins, max_depth):

    rf_start_time = time()

    # Create an Initial Model Instance
    mod_rf = RandomForestRegressor(labelCol='label',
                                   featuresCol='features',
                                   impurity='variance',
                                   featureSubsetStrategy='all',
                                   numTrees=n_trees,
                                   maxBins=max_bins,
                                   maxDepth=max_depth)

    # Training The Model
    rf_final_model = mod_rf.fit(df_train)

    # Scoring The Model On Test Sample
    rf_transformed = rf_final_model.transform(df_test)
    rf_test_results = rf_transformed.select(['prediction', 'label'])

    # Collecting The Model Statistics
    rf_evaluator = RegressionEvaluator(predictionCol="prediction",
                                       labelCol="label")
    rf_r2 = round(
        rf_evaluator.evaluate(rf_test_results,
                              {rf_evaluator.metricName: "r2"}), 3)
    rf_mse = round(
        rf_evaluator.evaluate(rf_test_results,
                              {rf_evaluator.metricName: "mse"}), 3)
    rf_rmse = round(
        rf_evaluator.evaluate(rf_test_results,
                              {rf_evaluator.metricName: "rmse"}), 3)
    rf_mae = round(
        rf_evaluator.evaluate(rf_test_results,
                              {rf_evaluator.metricName: "mae"}), 3)

    # Printing The Model Statistics
    print("\n++++++ Printing Random Forest Model Accuracy ++++++\n")
    print("R Square: " + str(rf_r2 * 100) + "%")
    print("Mean Squared Error: " + str(rf_mse))
    print("Root Mean Squared Error: " + str(rf_rmse))
    print("Mean Absolute Error: " + str(rf_mae))

    rf_end_time = time()
    rf_elapsed_time = (rf_end_time - rf_start_time) / 60
    rf_model_stat = pd.DataFrame({
        "Model Name": ["Random Forest"],
        "R Square": rf_r2,
        "Mean Squared Error": rf_mse,
        "Root Mean Squared Error": rf_rmse,
        "Mean Absolute Error": rf_mae,
        "Time (Min.)": round(rf_elapsed_time, 3)
    })
    rf_output = (rf_final_model, rf_model_stat)

    return rf_output
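A sketch of how the returned tuple might be consumed; the input path, split, and hyperparameter values are assumptions:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Any DataFrame with label/features columns works; libsvm input is an assumption.
df = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
df_train, df_test = df.randomSplit([0.7, 0.3], seed=42)

rf_model, rf_stats = model_dev_rf(df_train, df_test,
                                  n_trees=50, max_bins=32, max_depth=10)
print(rf_stats.to_string(index=False))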
Example #9
def RF(trainingData, testData):
    """
    Random Forest Regression Model
    :param trainingData:
    :param testData:
    :return: Trained model, predictions, nt (int), md (int)
    """
    nt, md = 120, 20
    rf = RandomForestRegressor(numTrees=nt, featureSubsetStrategy="auto",
                               impurity='variance', maxDepth=md, maxBins=100)
    model = rf.fit(trainingData)
    predictions = model.transform(testData)
    return model, predictions, nt, md
Example #10
def testRegression(train, test):
    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.

    rf = RandomForestRegressor(labelCol="indexedLabel", numTrees=3, maxDepth=4)

    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
Example #12
 def test_OutputNonNumericalGridSearch(self):
     assembler = VectorAssembler(inputCols=self.data.columns[1:(-1)], outputCol="features")
     stratifyCol = "foldID"
     featureAssembledData = assembler.transform(self.data).select("y", "features", stratifyCol)
     rf = RandomForestRegressor(featuresCol="features", 
                            labelCol="y",
                            minInstancesPerNode=1)
     evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="y")
     strategyGrid = ["sqrt", "5"]
     depthGrid = [3, 15]
     paramGrid = ParamGridBuilder()\
            .addGrid(rf.maxDepth, depthGrid)\
            .addGrid(rf.featureSubsetStrategy, strategyGrid)\
            .build()    
     validator = CrossValidatorWithStratificationID(estimator=rf,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           stratifyCol=stratifyCol)
     cvModel = validator.fit(featureAssembledData)
     metrics = cvModel.avgMetrics.drop("paramSetID")
     collectedMetrics = metrics.collect()        
     roundedMetrics = [localRoundMetricValue(x, "metric for CV rmse") for x in collectedMetrics]
     
     self.assertEqual(len(roundedMetrics), 4, "Incorrect number of returned metric values.")
     expectedMetricStructure = [
         {'metric for CV rmse': 1.073, 'maxDepth': 3.0, 'featureSubsetStrategy': 'sqrt'},
         {'metric for CV rmse': 1.07, 'maxDepth': 3.0, 'featureSubsetStrategy': '5'},
         {'metric for CV rmse': 1.111, 'maxDepth': 15.0, 'featureSubsetStrategy': 'sqrt'},
         {'metric for CV rmse': 1.108, 'maxDepth': 15.0, 'featureSubsetStrategy': '5'}
     ]
     for metric in roundedMetrics:
         self.assertTrue(metric in expectedMetricStructure,
                         "{0} is not expected. Expected one of: {1}.".format(metric, expectedMetricStructure))
Example #13
def engineerFeatures():
  sectionCV = CountVectorizer(inputCol='sections', outputCol="sectionVector")
  subsectionCV = CountVectorizer(inputCol='subsections', outputCol="subsectionVector")
  newsdeskCV = CountVectorizer(inputCol='newsdesks', outputCol="newsdeskVector")
  materialCV = CountVectorizer(inputCol='materials', outputCol="materialVector")
  keywordCV = CountVectorizer(inputCol='keywords', outputCol="keywordVector")
  symbolSI = StringIndexer(inputCol="Symbol",outputCol="indexedSymbol", handleInvalid='keep')
  
  va = VectorAssembler(inputCols=['sectionVector',
                                  'subsectionVector',
                                  'newsdeskVector',
                                  'materialVector',
                                  'indexedSymbol',
                                  'keywordVector'], outputCol='features')
  
  articleRfr = RandomForestRegressor(featuresCol="features", labelCol="PriceChange", predictionCol="pPriceChange",
                                     maxBins=5700)
  
  stages = [sectionCV,
            subsectionCV,
            newsdeskCV,
            materialCV,
            keywordCV,
            symbolSI,
            va,
            articleRfr]
  
  return stages
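This function (like Example #24 below) returns a bare stage list; a sketch of wiring it into a Pipeline, where the DataFrame `articles` and its columns are assumptions:

from pyspark.ml import Pipeline

# `articles` is an assumed DataFrame carrying the array/string columns the
# stages above expect (sections, subsections, ..., Symbol, PriceChange).
pipeline = Pipeline(stages=engineerFeatures())
model = pipeline.fit(articles)
model.transform(articles).select('pPriceChange', 'PriceChange').show(5)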
Example #14
def create_rf_pipeline():
    """Wrapper function that creates a pipeline including a Vector Assembler
    and a Random Forest Regressor.

    Args:
        None

    Returns:
        cols_to_keep (list): a list of the names of the features the model will train on
        pipeline: a pipeline of the feature assembler and the random forest
        param_grid: a grid of parameters for the grid search step

    """
    cols_to_keep = [
        'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
        'dropoff_latitude', 'day_of_week', 'hour_of_day', 'trip_distance',
        'haversine_dist', 'lat_dist', 'long_dist'
    ]

    feature_assembler = VectorAssembler(inputCols=cols_to_keep,
                                        outputCol='features')

    rf = RandomForestRegressor(labelCol='log_min_duration',
                               featuresCol='features')

    pipeline = Pipeline(stages=[feature_assembler, rf])

    param_grid = ParamGridBuilder()\
        .addGrid(rf.numTrees, [20, 50, 100]) \
        .addGrid(rf.maxDepth, [5, 10, 15])\
        .build()

    return cols_to_keep, pipeline, param_grid
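A sketch of the grid-search step this wrapper sets up, assuming a DataFrame `taxi_df` that contains the listed feature columns plus `log_min_duration`:

from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

cols_to_keep, pipeline, param_grid = create_rf_pipeline()

evaluator = RegressionEvaluator(labelCol='log_min_duration',
                                predictionCol='prediction',
                                metricName='rmse')
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=3)
cv_model = cv.fit(taxi_df)  # `taxi_df` is an assumed input DataFrame
print('best RMSE across the grid:', min(cv_model.avgMetrics))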
Example #15
def score_rf(split_input_train_df, split_input_validation_df, model_evaluator):
    global model_rmse, model_dict, model_count

    print(
        "###################### Random Forest Regression #########################"
    )
    rf_regressor = RandomForestRegressor(featuresCol='features',
                                         labelCol='total_delivery_duration')

    print("CrossValidation...")
    rf_paramGrid = ParamGridBuilder()\
    .addGrid(rf_regressor.maxBins, [5700, 6000])\
    .addGrid(rf_regressor.maxMemoryInMB, [256, 512])\
    .addGrid(rf_regressor.subsamplingRate, [0.1, 1.0])\
    .build()
    rf_cross_val = CrossValidator(estimator=rf_regressor,
                                  estimatorParamMaps=rf_paramGrid,
                                  evaluator=model_evaluator,
                                  numFolds=3)
    print("Done")
    print("Fitting training data...")
    rf_cv_model = rf_cross_val.fit(split_input_train_df)
    print("Done")
    print("Evaluating on validation data...")
    rmse = model_evaluator.evaluate(
        rf_cv_model.transform(split_input_validation_df))
    model_rmse.append(rmse)
    model_count += 1
    model_dict[model_count] = {}
    model_dict[model_count]["RF"] = rf_cv_model
    print("RMSE on validation data: %f" % rmse)
Example #16
def get_best_weather_model(data):
    train, test = data.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()

    estimator_gridbuilders = [
        estimator_gridbuilder(RandomForestRegressor(),
                              dict(maxDepth=[5], maxBins=[5], numTrees=[20])),
        estimator_gridbuilder(GBTRegressor(maxIter=100), dict())
    ]
    metricName = 'r2'
    tvs_list = make_weather_trainers(
        .2,  # fraction of data for training
        estimator_gridbuilders,
        metricName)
    ev = tvs_list[0].getEvaluator()
    scorescale = 1 if ev.isLargerBetter() else -1
    model_name_scores = []
    # print(list(tvs_list).count())
    for tvs in tvs_list:
        model = tvs.fit(train)
        test_pred = model.transform(test)
        score = ev.evaluate(test_pred) * scorescale
        model_name_scores.append(
            (model, get_estimator_name(tvs.getEstimator()), score))
    best_model, best_name, best_score = max(model_name_scores,
                                            key=lambda triplet: triplet[2])
    print("Best model is %s with validation data %s score %f" %
          (best_name, ev.getMetricName(), best_score * scorescale))
    return best_model
Example #17
def UsefulnessPredictionLDAWithoutCV(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0,
                         vocabSize=250)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")

    cvModel = pipeline.fit(trainingdata)

    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
Example #18
def main(inputs, model_file):
    data = spark.read.csv(inputs, schema=tmax_schema)
    data.registerTempTable('yesterday')
    #wthr_query = """SELECT  dayofyear(date) as dayofyr, latitude, longitude, elevation,tmax  FROM __THIS__"""
    wthr_query = """SELECT dayofyear(today.date) as dayofyr,today.latitude, today.longitude, today.elevation, today.tmax, yesterday.tmax as yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""
    
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    #define the assembler and regressor
    assembler = VectorAssembler(inputCols=["latitude", "longitude", "elevation", "dayofyr" ], outputCol="features")
    regressor = RandomForestRegressor(maxDepth=10, minInstancesPerNode=2, minInfoGain=0.5, labelCol = "tmax")
    trans_query = SQLTransformer(statement = wthr_query)
    
    #define pipeline and model
    wthr_pipeline = Pipeline(stages=[trans_query, assembler, regressor])
    wthr_model = wthr_pipeline.fit(train)
 
    #define the regression evaluator
    evaluator = RegressionEvaluator(labelCol="tmax", predictionCol="prediction")
    predictions = wthr_model.transform(validation)
    err = evaluator.evaluate(predictions)
    wthr_model.write().overwrite().save(model_file)
    print('Root Mean Square Error (RMSE): ' + str(err))
Example #19
    def rf_train(self, data, stages):
        """Random forest training using grid-search CV.
        """
        rf = RandomForestRegressor(featuresCol='features',
                                   labelCol="submission_ratio")
        stages.append(rf)
        pipeline = Pipeline(stages=stages)

        paramGrid = ParamGridBuilder() \
            .addGrid(rf.numTrees, [int(x) for x in np.linspace(start=10, stop=50, num=3)]) \
            .addGrid(rf.maxDepth, [int(x) for x in np.linspace(start=5, stop=25, num=3)]) \
            .build()

        self.evaluator = RegressionEvaluator(
            predictionCol='prediction',
            labelCol='submission_ratio',
            metricName='rmse',
        )
        cross_val = CrossValidator(estimator=pipeline,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=self.evaluator,
                                   numFolds=3)
        self.model = cross_val.fit(data)
        pip_model = self.model.bestModel
        pip_model.save("../data/model")
Example #20
def main():

    # 1. Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    # 2. Read the raw album-feature records from S3
    text_file = sc.textFile("s3a://spotifybuck/albumfeatures/2017/*/*/*/*/*")

    #3. Transform data
    af = (text_file.map(getVals))

    #4. Create a DataFrame out of this using the toDF method and cache it
    afdf = af.toDF([
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'duration'
    ]).cache()

    # The raw frame has no "features" or "label" column yet: assemble the six
    # audio features into a vector and use duration as the regression label
    # (treating duration as the target is an assumption).
    assembler = VectorAssembler(
        inputCols=['acousticness', 'danceability', 'energy',
                   'instrumentalness', 'liveness', 'loudness'],
        outputCol='features')
    afdf = assembler.transform(afdf).withColumnRenamed('duration', 'label')

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(afdf)

    #5. Create a train/test split with 70% of data in training set and 30% of data in test set
    afdf_train, afdf_test = afdf.randomSplit([0.7, 0.3], seed=123)

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(afdf_train)

    # Make predictions.
    predictions = model.transform(afdf_test)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    rfModel = model.stages[1]
    print(rfModel)  # summary only

    # Export the fitted model and the pipeline to S3

    rfModel.save('s3a://spotifybuck/model-export' +
                 datetime.now().strftime('%Y%m%d%H%M'))
    pipeline.save('s3a://spotifybuck/pipeline-export' +
                  datetime.now().strftime('%Y%m%d%H%M'))

    sc.stop()
Example #21
    def feature_imp_pyspark(self):
        num_var = [i[0] for i in self.data_frame.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!=self.target)]
        num_var = [col for col in num_var if not col.endswith('indexed')]
        # labels_count = [len(self.data_frame.select(col).distinct().collect()) for col in num_var]
        labels_count = [len(self.data_frame.agg((F.collect_set(col).alias(col))).first().asDict()[col]) for col in num_var]
        labels_count.sort()
        max_count =  labels_count[-1]
        #one_hot = [col for col in self.data_frame.columns if col.endswith('_indexed_encoded')]
        #num_var.extend(one_hot)
        label_indexes = StringIndexer(inputCol = self.target , outputCol = 'label', handleInvalid = 'keep')
        assembler = VectorAssembler(inputCols = num_var , outputCol = "features")
        if self.problem_type == 'REGRESSION':
            model = RandomForestRegressor(labelCol="label", \
                                     featuresCol="features", seed = 8464,\
                                     numTrees=10, cacheNodeIds = True,\
                                     subsamplingRate = 0.7)
        else:
            model = RandomForestClassifier(labelCol="label", \
                                     featuresCol="features", seed = 8464,\
                                     numTrees=10, cacheNodeIds = True,\
                                     subsamplingRate = 0.7,maxBins = max_count+2)
        pipe = Pipeline(stages =[assembler, label_indexes, model])

        mod_fit = pipe.fit(self.data_frame)
        df2 = mod_fit.transform(self.data_frame)
        cols = MLUtils.ExtractFeatureImp(mod_fit.stages[-1].featureImportances, df2, "features")
        cols_considered = cols.loc[cols['score'] > 0]
        cols_considered = list(cols_considered['name'])
        #tree_fs = list(set(cols_considered) & set(self.data_frame.columns))
        #tree_fs.extend(list(set([encoded for encoded in one_hot for column in cols_considered if column.startswith(encoded)])))
        self.data_change_dict['SelectedColsTree'] = cols_considered
        if self.target not in cols_considered:
            cols_considered.append(self.target)
        return cols_considered
Example #22
def randomForestRun(train, test, featureIndexer, zillow_test, test_cols):
    print("Training Data Table")
    train.show()
    print("Training...")
    rf = RandomForestRegressor(featuresCol="indexedFeatures")
    pipe = Pipeline(stages=[featureIndexer, rf])
    model = pipe.fit(train)
    print("Training... Done")
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("Random Forest Prediction")
    print("R-squared on test data = %g" % r2)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
    mae = evaluator.evaluate(predictions)
    print("Mean Absolute Error on test data = %g" % mae)
    print("Features Importances")
    k = model.stages[-1].featureImportances
    print(k)

    print("Predicted Price by Zillow")
    zillow_test.show()
    zillow_rdd = zillow_test.rdd
    input_ = zillow_rdd.map(lambda line: (line[0], line[1], line[2], line[3], line[4], line[5], Vectors.dense(line[0:-1])))
    zillow_test = spark.createDataFrame(input_, test_cols + ["features"])
    pred_zillow = model.transform(zillow_test)
    pred_zillow = pred_zillow.withColumn('prediction price/night', exp(pred_zillow.prediction))
    pred_zillow = pred_zillow.withColumn('prediction price/month', 30 * exp(pred_zillow.prediction))
    pred_zillow.show()
    return pred_zillow
Example #23
def UsefulnessPredictionLDA(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")

    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [150, 200, 250]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rmse,
                              numFolds=4)  # use 3+ folds in practice
    cvModel = crossval.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
Example #24
def engineerFeatures():
    actionSI = StringIndexer(inputCol="Action",
                             outputCol="indexedAction",
                             handleInvalid='keep')
    fromSI = StringIndexer(inputCol="From",
                           outputCol="indexedFrom",
                           handleInvalid='keep')
    toSI = StringIndexer(inputCol="To",
                         outputCol="indexedTo",
                         handleInvalid='keep')
    firmSI = StringIndexer(inputCol="Research Firm",
                           outputCol="indexedFirm",
                           handleInvalid='keep')
    symbolSI = StringIndexer(inputCol="Symbol",
                             outputCol="indexedSymbol",
                             handleInvalid='keep')
    va = VectorAssembler(inputCols=[
        'indexedAction', 'indexedFrom', 'indexedTo', 'indexedFirm',
        'indexedSymbol'
    ],
                         outputCol='features')
    analystRfr = RandomForestRegressor(featuresCol="features",
                                       labelCol="PriceChange",
                                       predictionCol="pPriceChange",
                                       maxBins=5700)

    stages = [actionSI, fromSI, toSI, firmSI, symbolSI, va, analystRfr]

    return stages
Example #25
def main(model_file):

    keyspace='technoaces'

    data = spark.read.format("org.apache.spark.sql.cassandra")\
                .options(table='imdb_movies_data', keyspace=keyspace).load()
    data = data.where(data['imdb_score']!=0).where(data['runtimemins']!=0).where(data['meta_score']!=0).where(data['votes']!=0)
    train, validation = data.randomSplit([0.75, 0.25])

    imdb_assembler = VectorAssembler(
        inputCols=['year','runtimemins','meta_score', 'votes'],
        outputCol='features')
    
    imdbclassifier = RandomForestRegressor(
        numTrees=2,featuresCol='features',
        labelCol='imdb_score',maxDepth=30,seed=1000)

    pipeline = Pipeline(stages=[imdb_assembler, imdbclassifier])
    
    model = pipeline.fit(train)
    predictions = model.transform(validation)

    predictions.select('imdb_id','title','runtimemins','meta_score', 'imdb_score','votes'\
                ,predictions['prediction'].alias('Predicted Score')).show()
    
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='imdb_score',
            metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print('r2 =', r2)

    return 0
Example #26
def test_boston_dataset(spark_session: SparkSession):
    boston = load_boston()
    feature_names = boston.feature_names.tolist()
    output_name = 'outcome'
    boston_columns = feature_names + [output_name]
    X = boston.data.tolist()
    y = boston.target.tolist()
    Xy = [(i + [j]) for (i, j) in zip(X, y)]
    boston_df = spark_session.createDataFrame(Xy, boston_columns)
    print(feature_names)
    must_include_features = []
    # must_include_features = ['TAX', 'INDUS']

    # %% Ranking features
    ranked_features = feature_ranker(
        df=boston_df,
        feature_columns=feature_names,
        output_column=output_name,
        must_include_features=must_include_features)
    print(ranked_features)
    # %% Feature selection
    scores = feature_selector(df=boston_df,
                              ranked_features=ranked_features,
                              output_column=output_name,
                              estimator_obj=RandomForestRegressor(),
                              feature_inclusion_increments=1,
                              train_test_split_ratio=[0.66, 0.33],
                              cv=-1,
                              evaluation_metric='r2')
    print(scores)
Example #27
def Distr_RandomForestRegressor(xy_train, xy_test):
    rf = RandomForestRegressor(minInstancesPerNode=20, maxDepth=25)
    evalu = RegressionEvaluator()
    grid_1 = ParamGridBuilder()\
            .addGrid(rf.numTrees, [100])\
            .addGrid(rf.featureSubsetStrategy, ['0.5','0.8','1.0'])\
            .build()
    cv_1 = CrossValidator(estimator=rf,
                          estimatorParamMaps=grid_1,
                          evaluator=evalu,
                          numFolds=5)
    # Search the grid for the best parameter combination; cvModel holds the best estimated model.
    cvModel_1 = cv_1.fit(xy_train)
    print("Grid scores: ")
    best_params_1 = Get_best_params(cvModel_1, 'reg')['featureSubsetStrategy']
    grid = ParamGridBuilder()\
            .addGrid(rf.numTrees, [300,500])\
            .addGrid(rf.featureSubsetStrategy, [best_params_1,])\
            .build()
    cv = CrossValidator(estimator=rf,
                        estimatorParamMaps=grid,
                        evaluator=evalu,
                        numFolds=5)
    # Search the refined grid for the best parameter combination.
    cvModel = cv.fit(xy_train)
    best_params = Get_best_params(cvModel, 'reg')

    print("Best parameters set found: %s" % best_params)

    return cvModel.bestModel
Example #28
def UsefulnessPredictionSentmentWithoutCV(trainingdata, model):
    # Data Preprocessing
    assembler = VectorAssembler(inputCols=[
        'num', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos',
        'sentiment_compound', 'Character_adj', 'Character_noun',
        'Character_verb', 'Character_adv'
    ],
                                outputCol="features")

    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4)

    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="indexedFeatures")

    pipeline = Pipeline(stages=[assembler, featureIndexer, model])

    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")

    Model = pipeline.fit(trainingdata)

    return Model
Example #29
    def test_random_forest_regression(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        feature_count = 5
        self.spark.udf.register(
            "truncateFeatures",
            lambda x: SparseVector(feature_count, range(0, feature_count),
                                   x.toArray()[125:130]), VectorUDT())
        data = original_data.selectExpr(
            "cast(label as string) as label",
            "truncateFeatures(features) as features")
        label_indexer = StringIndexer(inputCol="label",
                                      outputCol="indexedLabel")
        feature_indexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=10,
                                        handleInvalid='error')

        rf = RandomForestRegressor(labelCol="indexedLabel",
                                   featuresCol="indexedFeatures",
                                   numTrees=10)
        pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
        model = pipeline.fit(data)
        model_onnx = convert_sparkml(
            model,
            'Sparkml RandomForest Regressor',
            [('label', StringTensorType([1, 1])),
             ('features', FloatTensorType([1, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data.limit(1))
        data_np = {
            'label':
            data.limit(1).toPandas().label.values,
            'features':
            data.limit(1).toPandas().features.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        }
        expected = [
            predicted.toPandas().indexedLabel.values.astype(numpy.int64),
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlRandomForestRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                               data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #30
 def model_define(self):
     """Returns a model with the hyperparameters inputted in
     :func:`get_parameters`.

     Returns:
         (pyspark.ml.regression.RandomForestRegressor)
             Random Forest Regression model
     """
     return RandomForestRegressor()
Example #31
def main(inputs, out_model):
    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    query = "SELECT dayofyear(today.date) as doy, today.latitude, today.longitude, today.elevation,today.tmax,yesterday.tmax AS yesterday_tmax \
    FROM __THIS__ as today \
    INNER JOIN __THIS__ as yesterday \
    ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"

    #query ="SELECT station,date, dayofyear(date) as doy, latitude, longitude, elevation,tmax  FROM __THIS__"
    getDOY = SQLTransformer(statement=query)

    feature_cols = ['latitude', 'longitude', 'elevation', 'doy']
    column_names = dict(featuresCol="features",
                        labelCol="tmax",
                        predictionCol="prediction")

    feature_assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol=column_names["featuresCol"])

    # Testing different models to fit the best one!!!
    #est=GBTRegressor(maxIter=400,maxDepth=20)
    est = RandomForestRegressor(featureSubsetStrategy="log2",
                                minInfoGain=0.5,
                                numTrees=40)
    #est=DecisionTreeRegressor(maxDepth=10,minInstancesPerNode=4,minInfoGain=0.5)
    est = est.setParams(**column_names)
    pl = Pipeline(stages=[getDOY, feature_assembler, est])
    model = pl.fit(train)

    predictions = model.transform(validation)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print('\n\nr2=', r2)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('\n\nrmse=', rmse)
    model.write().overwrite().save(out_model)
Example #32
    def _train_model_spark(self, data):
        df = self._prepare_data_spark(data)
        input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                                self.TODAY_PRICE}))

        if self.ann_hidden_nodes_num is None:
            # Integer division: layer sizes must be ints (plain / yields a float on Python 3).
            self.ann_hidden_nodes_num = input_num // 2 + 1
        ann_layers = [input_num,
                      # input_num / 3 * 2,
                      # input_num / 3,
                      self.ann_hidden_nodes_num,
                      2]

        self.logger.info('layer settings are {}'.format(ann_layers))
        self.logger.info('training method is {}'.format(self._train_method))
        self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
        if isinstance(self._train_method, dict):
            if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                self._model[self.CHANGE_AMOUNT].stop_server()
            self._model = {self.CHANGE_AMOUNT: None,
                           self.CHANGE_DIRECTION: None}

            if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                      maxIter=self.linear_regression_training_times,
                                      regParam=self.linear_regression_regularization_parameter,
                                      predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = lr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth,
                                            predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                          num_workers=self.spark_worker_numbers,
                                                                          epoch=self.ann_epoch_number,
                                                                          featuresCol="features",
                                                                          labelCol=self.CHANGE_AMOUNT,
                                                                          predictionCol='AmountPrediction'
                                                                          )
                self._model[self.CHANGE_AMOUNT].fit(df)
            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

            if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
                lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                        maxIter=self.logistic_regression_training_times,
                                        regParam=self.linear_regression_regularization_parameter,
                                        predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = lr.fit(df)
            elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
                rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                             numTrees=self.random_forest_tree_number,
                                             maxDepth=self.random_forest_tree_max_depth,
                                             predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = rfc.fit(df)

            elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 2
                mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                      labelCol=self.CHANGE_DIRECTION,
                                                      layers=ann_layers,
                                                      predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        else:
            if self._train_method == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                      regParam=self.linear_regression_regularization_parameter,
                                      maxIter=self.linear_regression_training_times)
                self._model = lr.fit(df)
            elif self._train_method == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                            predictionCol='prediction',
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth)
                self._model = rfr.fit(df)

            elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                if self._model is not None:
                    self._model.stop_server()
                self.logger.warn('layers are {}'.format(ann_layers))
                self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                      num_workers=self.spark_worker_numbers, epoch=100,
                                                      featuresCol="features", labelCol=self.TARGET_PRICE,
                                                      predictionCol='prediction'
                                                      )
                self._model.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        return self._model
Example #33
  encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol+"classVec")
  stages += [encoder]
#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount","tip_amount"]
assemblerInputs = list(map(lambda c: c + "classVec", categoricalColumns)) + list(map(lambda c: c + "classVec", encColumns)) + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)

from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(numTrees=4, featuresCol="features", labelCol='total_amount', maxDepth=2, seed=42)
rf_model = rf.fit(dataset)
rf_model.write().overwrite().save("./nyc-01020304-6vm-18-RF-model")


import sys
sys.exit(0)
"""
from pyspark.ml.feature import VectorAssembler
#vectorAssembler = VectorAssembler(inputCols = ['key', 'passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'fare_amount')
#newDF_test1=df_test1.withColumn('Travel_Distance',fun_dist_udf(df_test1["pickup_latitude"],df_test1["pickup_longitude"],df_test1["dropoff_latitude"],df_test1["dropoff_longitude"]))
#vectorAssembler = VectorAssembler(inputCols = ['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vectorAssembler = VectorAssembler(inputCols = ['passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(train_X4)
vhouse_df = vhouse_df.select(['features', 'fare_amount'])
vhouse_df.show(3)
glrModel = glr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)


# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity")
pipeline = Pipeline().setStages([glr])