Example #1
def runGBT(busDF='', usrDF='', revDF='', fraction=[0.7, 0.3]):
    print('GBTRegressor:')
    print('  Rating Prediction:')
    gbt = GBTRegressor(maxIter=1, maxDepth=1, seed=42)
    businessDF, userDF, starDF = traingbt.dataClean(busDF=busDF,
                                                    usrDF=usrDF,
                                                    revDF=revDF)
    # split starDF to training data and test data
    trainStarDF, testStarDF = starDF.randomSplit(fraction)

    trainDF = traingbt.transData4GBT(businessDF, userDF, trainStarDF)

    model = gbt.fit(trainDF)

    testDF = traingbt.transData4GBT(businessDF, userDF, testStarDF)
    predDF = model.transform(testDF)

    predDF.show()
    errors = predDF.rdd.map(lambda x: (x.label - x.prediction)**2).collect()
    RMSE = math.sqrt(sum(errors) / len(errors))
    print('    GBTRegressor RMSE: %.8f' % RMSE)

    print('  Recommendation:')
    # recDF = traingbt.recommendation(businessDF, testStarDF, testDF, model)
    # recDF.printSchema()
    # the recommendation step above is commented out, so this reuses the regression RMSE
    print('    Recommendation RMSE: %.8f' % RMSE)
Example #2
def gbdtRegression(df, arguments):
	from pyspark.ml.regression import GBTRegressor
	numTrees = 20
	stepSize = 0.1
	maxDepth = 5
	minInstancesPerNode = 1

	# tree-structure parameters must be integers; stepSize is a float
	if arguments.maxDepth is not None:
		maxDepth = int(arguments.maxDepth)

	if arguments.minInstancesPerNode is not None:
		minInstancesPerNode = int(arguments.minInstancesPerNode)

	if arguments.numTrees is not None:
		numTrees = int(arguments.numTrees)

	if arguments.stepSize is not None:
		stepSize = float(arguments.stepSize)

	if arguments.impurity is not None:
		impurity = arguments.impurity  # parsed but not passed to the estimator below

	gbdt = GBTRegressor(maxIter=numTrees,
						stepSize=stepSize,
						maxDepth=maxDepth,
						minInstancesPerNode=minInstancesPerNode)
	model = gbdt.fit(df)

	return model
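
The `arguments` object above reads like an argparse namespace. A minimal sketch of how it might be built, assuming flag names that mirror the attribute accesses (the flags are not confirmed by the source):

import argparse

parser = argparse.ArgumentParser(description='GBDT regression options')
parser.add_argument('--maxDepth', default=None)             # tree depth (cast to int above)
parser.add_argument('--minInstancesPerNode', default=None)  # cast to int above
parser.add_argument('--numTrees', default=None)             # mapped to maxIter
parser.add_argument('--stepSize', default=None)             # learning rate (float)
parser.add_argument('--impurity', default=None)             # parsed but unused above
arguments = parser.parse_args()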
Example #3
 def test_gbt_regressor(self):
     data = self.spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                        (0.0, Vectors.sparse(1, [], []))],
                                       ["label", "features"])
     gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml GBTRegressor',
         [('features', FloatTensorType([1, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlGBTRegressor")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #4
    def fit(self):
        """
        Creates the pipeline, splits the data , fits the model and save the model, also evaluates the results
        :return:
        """
        cols = [
            x for x in self.data.columns
            if x not in ['datetime', 'label', 'speed_overground']
        ]
        assembler = VectorAssembler(handleInvalid="keep",
                                    inputCols=cols,
                                    outputCol="features")

        print('assembler')
        train = assembler.transform(self.data)
        train = train.drop(*cols)
        gbt = GBTRegressor(labelCol="speed_overground",
                           featuresCol="features",
                           predictionCol='predictions')

        print('Train model.')
        model = gbt.fit(train)
        # Save and load model
        model.write().overwrite().save('myGBTRegressor_nan')
        predictions = model.transform(train)
        evaluator = RegressionEvaluator(labelCol="speed_overground",
                                        predictionCol="predictions",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
Example #5
def run_GBT_Regression(dependent_variable, data_vector):
    train_df, test_df = create_train_test_df(dependent_variable, data_vector)

    regressor = GBTRegressor(featuresCol='features',
                             labelCol=dependent_variable,
                             maxIter=10)
    regressor_model = regressor.fit(train_df)
    print('feature importance is {}'.format(
        regressor_model.featureImportances))

    # Make predictions.
    predictions = regressor_model.transform(test_df)

    # Select example rows to display.
    predictions.select("prediction", dependent_variable, "features").show()

    # Select (prediction, true label) and compute test error
    r2_evaluator = RegressionEvaluator(labelCol=dependent_variable,
                                       predictionCol="prediction",
                                       metricName="r2")
    rmse_evaluator = RegressionEvaluator(labelCol=dependent_variable,
                                         predictionCol="prediction",
                                         metricName="rmse")

    print("R Squared (R2) on test data = %g" %
          r2_evaluator.evaluate(predictions))
    print("Root Mean Squared Error (RMSE) on test data = %g" %
          rmse_evaluator.evaluate(predictions))

    return [
        r2_evaluator.evaluate(predictions),
        rmse_evaluator.evaluate(predictions),
        regressor_model.featureImportances.toArray().tolist()
    ]
    def gradientBoostRegressorModel(self):
        gradientBoostRegressorModelFit = \
            GBTRegressor(labelCol=self.labelColm,
                         featuresCol=self.featuresColm,
                         predictionCol=self.modelSheetName)
        regressor = gradientBoostRegressorModelFit.fit(self.trainData)
        # predictionData = regressor.transform(self.testData)

        regressionStat = self.randomGradientRegressionModelEvaluation(regressor=regressor)

        # persisting model
        modelName = "gradientBoostModel"
        extension = ".parquet"
        modelStorageLocation = self.locationAddress + self.userId.upper() + modelName.upper() + extension
        regressor.write().overwrite().save(modelStorageLocation)

        regressionStat["modelPersistLocation"] = {"modelName": modelName,
                                                  "modelStorageLocation": modelStorageLocation}

        return regressionStat
Example #7
def build_gradient_boosted_tree_regression(observation_df, feature_columns):
    # Create new column with all of the features
    vector_observation_df = create_feature_column(observation_df,
                                                  feature_columns,
                                                  ['features', 'duration_sec'])

    train_df, test_df = vector_observation_df.randomSplit([0.7, 0.3])
    gbt = GBTRegressor(featuresCol="features",
                       labelCol="duration_sec",
                       maxIter=15)

    model = gbt.fit(train_df)

    test_predictions = model.transform(test_df)

    test_predictions.select("prediction", "duration_sec", "features").show(5)

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="rmse")
    print("RMSE on test data = %g" % evaluator.evaluate(test_predictions))

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="r2")

    print("R2 on test data = %g" % evaluator.evaluate(test_predictions))

    return model
Example #8
def model_dev_gbm(df_train, df_test, max_iter, max_bins, max_depth):

    gbm_start_time = time()

    # Create an Initial Model Instance
    mod_gbm = GBTRegressor(labelCol='label',
                           featuresCol='features',
                           featureSubsetStrategy='all',
                           lossType='squared',
                           maxIter=max_iter,
                           maxBins=max_bins,
                           maxDepth=max_depth)

    # Training The Model
    gbm_final_model = mod_gbm.fit(df_train)

    # Scoring The Model On Test Sample
    gbm_transformed = gbm_final_model.transform(df_test)
    gbm_test_results = gbm_transformed.select(['prediction', 'label'])

    # Collecting The Model Statistics
    gbm_evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label")
    gbm_r2 = round(
        gbm_evaluator.evaluate(gbm_test_results,
                               {gbm_evaluator.metricName: "r2"}), 3)
    gbm_mse = round(
        gbm_evaluator.evaluate(gbm_test_results,
                               {gbm_evaluator.metricName: "mse"}), 3)
    gbm_rmse = round(
        gbm_evaluator.evaluate(gbm_test_results,
                               {gbm_evaluator.metricName: "rmse"}), 3)
    gbm_mae = round(
        gbm_evaluator.evaluate(gbm_test_results,
                               {gbm_evaluator.metricName: "mae"}), 3)

    # Printing The Model Statistics
    print("\n++++++ Printing Gradient Boosting Model Accuracy ++++++\n")
    print("R Square: " + str(gbm_r2 * 100) + "%")
    print("Mean Squared Error: " + str(gbm_mse))
    print("Root Mean Squared Error: " + str(gbm_rmse))
    print("Mean Absolute Error: " + str(gbm_mae))

    gbm_end_time = time()
    gbm_elapsed_time = (gbm_end_time - gbm_start_time) / 60
    gbm_model_stat = pd.DataFrame({
        "Model Name": ["Gradient Boosting"],
        "R Square": gbm_r2,
        "Mean Squared Error": gbm_mse,
        "Root Mean Squared Error": gbm_rmse,
        "Mean Absolute Error": gbm_mae,
        "Time (Min.)": round(gbm_elapsed_time, 3)
    })
    gbm_output = (gbm_final_model, gbm_model_stat)

    return gbm_output
Example #9
def GBT(trainingData, testData):
    """
     Gradient Boosted Tree Regression Model
    :param trainingData:
    :param testData:
    :return: Trained model, predictions
    """
    gbt = GBTRegressor(maxIter=100, maxDepth=6, seed=42)
    model = gbt.fit(trainingData)
    predictions = model.transform(testData)
    return model, predictions
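
A hedged usage sketch for the helper above; the trainingData/testData DataFrames with features/label columns are assumptions, not part of the source:

from pyspark.ml.evaluation import RegressionEvaluator

model, predictions = GBT(trainingData, testData)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
print("RMSE = %g" % evaluator.evaluate(predictions))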
Example #10
def traingbt(datafrom='json', business_path='', user_path='', star_path=''):
    gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
    if datafrom == 'json':
        businessDF, userDF, starDF = loadDataJson(business_path=business_path,
                                                  user_path=user_path,
                                                  star_path=star_path)
    elif datafrom == 'mongodb':
        businessDF, userDF, starDF = loadDataMongo()
    else:
        raise ValueError('unsupported data source: %s' % datafrom)
    data = transData4GBT(businessDF, userDF, starDF)
    model = gbt.fit(data)
    return model
Example #11
def gradient_boosted_tree_regression(train_data, test_data):
    gbt = GBTRegressor(featuresCol='features', labelCol='MEDV', maxIter=10)
    gbt_model = gbt.fit(train_data)
    gbt_predictions = gbt_model.transform(test_data)
    # show() prints and returns None, so don't wrap it in print()
    gbt_predictions.select('prediction', 'MEDV', 'features').show(5)
    gbt_evaluator = RegressionEvaluator(
        labelCol='MEDV',
        predictionCol='prediction',
        metricName='rmse',
    )
    rmse = gbt_evaluator.evaluate(gbt_predictions)
    print('Root Mean Squared Error (RMSE) on test data = %g' % rmse)
Example #12
def testRegression(train, test):
    # Train a GradientBoostedTrees model.

    gbt = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = gbt.fit(train)
    # DataFrames have no .map(); go through the underlying RDD for RegressionMetrics
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
Example #14
def params_of_GBTRegressor():
    """Print the default parameter map of GBTRegressor.

    A grid-search helper built over the same params might look like:

        def param_grid_gbtr(esti: Estimator) -> list:
            return ParamGridBuilder() \
                .addGrid(esti.maxBins, [16, 32, 64]) \
                .addGrid(esti.maxDepth, [3, 5, 10]) \
                .addGrid(esti.lossType, ['absolute', 'squared']) \
                .addGrid(esti.featureSubsetStrategy, ['auto', 'all']) \
                .build()

    Defaults reported by extractParamMap(), with abridged docs:

        maxBins: 32                   max number of bins for discretizing continuous
                                      features; must be >= 2 and >= the number of
                                      categories of any categorical feature
        lossType: 'squared'           loss function GBT tries to minimize
                                      (case-insensitive); options: squared, absolute
        featureSubsetStrategy: 'all'  number of features to consider for splits at
                                      each tree node; options: 'auto', 'all',
                                      'onethird', 'sqrt', 'log2', or n as a fraction
                                      in (0, 1] / a count in (1, #features)
        minInstancesPerNode: 1        minimum instances each child must have after a
                                      split, else the split is discarded as invalid
        subsamplingRate: 1.0          fraction of training data used per tree, in (0, 1]
        maxDepth: 5                   maximum tree depth (>= 0)
        checkpointInterval: 10        checkpoint every N iterations, or -1 to disable
        cacheNodeIds: False           cache node IDs per instance to speed up training
                                      of deeper trees
        stepSize: 0.1                 learning rate in (0, 1] shrinking each
                                      estimator's contribution
        minInfoGain: 0.0              minimum information gain for a split
        maxMemoryInMB: 256            max memory for histogram aggregation
        maxIter: 20                   max number of iterations (>= 0)
    """
    SparkSession.builder \
        .appName("tryout") \
        .getOrCreate()
    m = GBTRegressor()
    pm = m.extractParamMap()
    pprint(pm)
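
A sketch of wiring the param_grid_gbtr helper from the docstring into a CrossValidator; the training DataFrame df with features/label columns is an assumption:

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

gbtr = GBTRegressor()
cv = CrossValidator(estimator=gbtr,
                    estimatorParamMaps=param_grid_gbtr(gbtr),  # helper from the docstring
                    evaluator=RegressionEvaluator(metricName="rmse"),
                    numFolds=3)
# best_model = cv.fit(df).bestModel  # df is an assumed training DataFrame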
Example #15
 def train_boosted_regression(self,
                              depth=2,
                              n_trees=50,
                              learning_rate=.01,
                              max_cats=6):
     '''
     train dataset on boosted decision trees
     --------
     Parameters
     depth: int - maximum allowable depth of the decision trees
     n_trees: int - max number of iterations
     learning_rate: float - step size at which the model fits
     max_cats: int - max number of categories for the vector indexer
     --------
     '''
     featureIndexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=max_cats).fit(self.train)
     # consume the indexer's output so the VectorIndexer stage actually feeds the GBT
     gbr = GBTRegressor(labelCol='label',
                        featuresCol="indexedFeatures",
                        maxDepth=depth,
                        maxIter=n_trees,
                        stepSize=learning_rate,
                        maxMemoryInMB=2000)
     pipeline = Pipeline(stages=[featureIndexer, gbr])
     # Train model.  This also runs the indexer.
     self.model = pipeline.fit(self.train)
Example #16
def score_gbt(split_input_train_df, split_input_validation_df,
              model_evaluator):
    global model_rmse, model_dict, model_count

    print(
        "###################### Gradient Boosted Tree Regression #########################"
    )
    gbt_regressor = GBTRegressor(featuresCol='features',
                                 labelCol='total_delivery_duration')

    print("CrossValidation...")
    gbt_paramGrid = ParamGridBuilder()\
    .addGrid(gbt_regressor.maxIter, [5, 10])\
    .addGrid(gbt_regressor.maxBins, [5700, 6000])\
    .addGrid(gbt_regressor.maxMemoryInMB, [256, 512])\
    .addGrid(gbt_regressor.subsamplingRate, [0.1, 1.0])\
    .build()
    gbt_cross_val = CrossValidator(estimator=gbt_regressor,
                                   estimatorParamMaps=gbt_paramGrid,
                                   evaluator=model_evaluator,
                                   numFolds=3)
    print("Done")
    print("Fitting training data...")
    gbt_cv_model = gbt_cross_val.fit(split_input_train_df)
    print("Done")
    print("Evaluating on validation data...")
    rmse = model_evaluator.evaluate(
        gbt_cv_model.transform(split_input_validation_df))
    model_rmse.append(rmse)
    model_count += 1
    model_dict[model_count] = {}
    model_dict[model_count]["GBT"] = gbt_cv_model
    print("RMSE on validation data: %f" % rmse)
Example #17
def get_best_weather_model(data):
    train, test = data.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()

    estimator_gridbuilders = [
        estimator_gridbuilder(RandomForestRegressor(),
                              dict(maxDepth=[5], maxBins=[5], numTrees=[20])),
        estimator_gridbuilder(GBTRegressor(maxIter=100), dict())
    ]
    metricName = 'r2'
    tvs_list = make_weather_trainers(
        .2,  # fraction of data for training
        estimator_gridbuilders,
        metricName)
    ev = tvs_list[0].getEvaluator()
    scorescale = 1 if ev.isLargerBetter() else -1
    model_name_scores = []
    # print(list(tvs_list).count())
    for tvs in tvs_list:
        model = tvs.fit(train)
        test_pred = model.transform(test)
        score = ev.evaluate(test_pred) * scorescale
        model_name_scores.append(
            (model, get_estimator_name(tvs.getEstimator()), score))
    best_model, best_name, best_score = max(model_name_scores,
                                            key=lambda triplet: triplet[2])
    print("Best model is %s with validation data %s score %f" %
          (best_name, ev.getMetricName(), best_score * scorescale))
    return best_model
Example #18
def train_model(model_file, inputs): 
    # get the data
    train_tmax = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = train_tmax.randomSplit([0.75, 0.25], seed=110)
   
    #query ="SELECT station,date, dayofyear(date) as doy, latitude, longitude, elevation,tmax  FROM __THIS__"
    
    query = """SELECT today.station, dayofyear(today.date) as doy, today.latitude, today.longitude, today.elevation, today.tmax, yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""
    
    #weather_assembler = VectorAssembler(inputCols=['latitude','longitude','elevation', 'doy'], outputCol="features")
    weather_assembler = VectorAssembler(inputCols=['latitude','longitude','elevation', 'doy', 'yesterday_tmax'], outputCol="features")
    regressor = GBTRegressor(maxIter=50, maxDepth=5, featuresCol="features", labelCol="tmax")
    transquery = SQLTransformer(statement=query)
    pipeline = Pipeline(stages=[transquery,weather_assembler,regressor])
    model = pipeline.fit(train)
    model.write().overwrite().save(model_file)
 
    # use the model to make predictions
    predictions = model.transform(validation)
    #predictions.show()
    
    # evaluate the predictions
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
            metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
            metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)

    print('r2 =', r2)
    print('rmse =', rmse)
Example #19
def train_gbt(train_data, test_data, label, file_to_save):
    route_arity = train_data.select('route').distinct().count()

    duration_gbt = GBTRegressor(labelCol=label, featuresCol="features")

    paramGrid = (ParamGridBuilder().addGrid(duration_gbt.maxDepth,
                                            [2, 4, 6]).addGrid(
                                                duration_gbt.maxBins,
                                                [300]).build())

    crossval = CrossValidator(estimator=duration_gbt,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(labelCol=label),
                              numFolds=5)

    cvModel = crossval.fit(train_data)

    bestModel = cvModel.bestModel
    maxDepth = bestModel._java_obj.getMaxDepth()
    maxBins = bestModel._java_obj.getMaxBins()

    gbt_params_folder_path = params_folder_path + 'GBT/'
    if not os.path.exists(gbt_params_folder_path):
        os.makedirs(gbt_params_folder_path)
    # text mode: writing str to a 'wb' handle raises TypeError in Python 3
    with open(gbt_params_folder_path + label + 'Params.csv', 'w') as file:
        file.write("param, value" + '\n')
        file.write("maxDepth, " + str(maxDepth) + '\n')
        file.write("maxBins, " + str(maxBins) + '\n')

    save_test_info(cvModel.bestModel, test_data, label + "-gbt", file_to_save)

    return cvModel
Example #20
def main(inputs, model_file):
    data = spark.read.csv(inputs, schema=schema())
    train, validation = data.randomSplit([0.75, 0.25], seed=42)

    sql_transformer1 = SQLTransformer(statement=yes_tmax())
    sql_transformer2 = SQLTransformer(statement=ret_query())
    assemble_features = VectorAssembler(inputCols=[
        'latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'
    ],
                                        outputCol='features')
    regressor = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[
        sql_transformer1, sql_transformer2, assemble_features, regressor
    ])

    model = pipeline.fit(train)
    predictions = model.transform(validation)
    model.write().overwrite().save(model_file)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)

    print("R-squared value : " + str(r2))
    print("RMSE value : " + str(rmse))
Example #21
def main(inputs, model_file):
    sensor_data_df = spark.read.format("org.apache.spark.sql.cassandra").options(table=sensor_data_table,
                                                                                 keyspace=keyspace).load()
    # creating a ML pipeline

    sensor_data_df = sensor_data_df.select(sensor_data_df['datetime'],
                                         sensor_data_df['latitude'],
                                         sensor_data_df['longitude'],
                                         sensor_data_df['message_code_id'],
                                         sensor_data_df['sensor_reading'],
                                         sensor_data_df['sensor_name']).orderBy(sensor_data_df['datetime'].asc())
    train_set, validation_set = sensor_data_df.randomSplit([0.75, 0.25])
    train_set = train_set.cache()
    validation_set = validation_set.cache()
    sql_transformer_statement = "SELECT latitude, longitude, sensor_name, sensor_reading, message_code_id" \
                                 "FROM __THIS__"

    sql_transformer = SQLTransformer(statement=sql_transformer_statement)
    # note: sensor_name must already be numeric here; a raw string column would
    # need a StringIndexer before VectorAssembler
    assemble_features = VectorAssembler(inputCols=['latitude', 'longitude', 'sensor_name', 'sensor_reading'],
                                        outputCol='features')
    regressor = GBTRegressor(featuresCol='features', labelCol='message_code_id')
    pipeline = Pipeline(stages=[sql_transformer, assemble_features, regressor])
    model = pipeline.fit(train_set)

    predictions = model.transform(validation_set)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='message_code_id', metricName='r2')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='message_code_id', metricName='rmse')
    r2_score = r2_evaluator.evaluate(predictions)
    rmse_score = rmse_evaluator.evaluate(predictions)
    print('r2 validation score : ', r2_score)
    print('rmse validation score: ', rmse_score)
Example #22
def train_model():
    print_title("download winequality-red data!")
    download_dataset()
    print_title("load data!")
    # load iris.csv into Spark dataframe
    print("First 10 rows of Iris dataset:")
    lines = pd.read_csv("/tmp/winequality-red.csv")
    lines_df = spark.createDataFrame(lines)
    # convert features
    assembler = pyspark.ml.feature.VectorAssembler(inputCols=["fixed acidity", "volatile acidity", "citric acid", "residual sugar",
                                                              "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
                                                              "pH", "sulphates", "alcohol"], outputCol='features')
    # convert text labels into indices
    label_indexer = pyspark.ml.feature.StringIndexer(inputCol='quality', outputCol='label').fit(lines_df)

    # train
    gbt = GBTRegressor(featuresCol="features", maxIter=10)
    
    label_converter = pyspark.ml.feature.IndexToString(inputCol='prediction', outputCol='predictionClass', labels=label_indexer.labels)
    
    pipeline = Pipeline(stages=[assembler, label_indexer, gbt, label_converter])

    # fit the pipeline to training documents.
    model_local_path = os.path.join(LOCAL_MODEL_PATH, MODEL_NAME)
    model = pipeline.fit(lines_df)
    # save model
    model.save(model_local_path)
    metric_dict = {}
    calculate_metric_value(model, lines_df)
Example #23
def main(inputs,output):
    tmax_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.DateType()),
    types.StructField('latitude', types.FloatType()),
    types.StructField('longitude', types.FloatType()),
    types.StructField('elevation', types.FloatType()),
    types.StructField('tmax', types.FloatType()),
    ])

    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    sqlTrans = SQLTransformer(statement = 'select *,dayofyear(date) as day FROM __THIS__')
 
    sqlTrans1 = SQLTransformer(statement = 'SELECT today.station,today.date,today.latitude,today.longitude,today.elevation,today.tmax, yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station')
    assemble_features = VectorAssembler(inputCols = ['latitude','longitude','elevation','day','yesterday_tmax'], outputCol = 'features')

    gbt = GBTRegressor(featuresCol = 'features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans1,sqlTrans,assemble_features,gbt])
    weather_model = pipeline.fit(train)

    predictions = weather_model.transform(validation)
    #predictions.show()
    evaluator = RegressionEvaluator(labelCol = 'tmax', predictionCol = 'prediction', metricName = 'rmse')
    score = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % score)

    weather_model.write().overwrite().save(output)
Example #24
 def pick_algorithm(self, algorithm, features_col, label_col):
     """."""
     if algorithm == 'linearregression':
         return LinearRegression(featuresCol=features_col,
                                 labelCol=label_col)
     elif algorithm == 'gbtregressor':
         return GBTRegressor(featuresCol=features_col, labelCol=label_col)
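
A brief usage sketch from inside the same class; the call site and train_df are assumptions:

# 'gbtregressor' selects the GBT branch above; train_df is an assumed DataFrame
estimator = self.pick_algorithm('gbtregressor', 'features', 'label')
model = estimator.fit(train_df)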
Example #25
    def preprocessing(self):

        model = GBTRegressor(labelCol="bicycle_rentals")

        cols = [
            "part_time", "holiday", "week_days", "weather_description_mf",
            "month"
        ]

        imputer = Imputer(inputCols=["humidity", "pressure"],
                          outputCols=["humidity_input", "pressure_input"])

        indexers = [
            StringIndexer(inputCol=col, outputCol="{0}_indexed".format(col))
            for col in cols
        ]

        assembler = VectorAssembler(inputCols=[
            "part_time_indexed", "holiday_indexed", "month_indexed",
            "week_days_indexed", "weather_description_mf_indexed",
            "humidity_input", "pressure_input", "temperature", "wind_speed",
            "from_station_id", "mean_dpcapacity_start", "mean_dpcapacity_end",
            "sum_subscriber", "sum_customer"
        ],
                                    outputCol="features")

        pipeline = Pipeline(stages=[imputer] + indexers + [assembler] +
                            [model])

        return pipeline
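
A possible call site for the pipeline returned above; pipeline and the input DataFrame df (with the listed feature columns and a bicycle_rentals label) are assumptions:

# pipeline: result of preprocessing(); df: assumed input DataFrame
fitted = pipeline.fit(df)
predictions = fitted.transform(df)
predictions.select("prediction", "bicycle_rentals").show(5)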
Example #26
def main():
    data = spark.range(100000)
    data = data.select(
        (functions.rand()*100).alias('length'),
        (functions.rand()*100).alias('width'),
        (functions.rand()*100).alias('height'),
    )
    data = data.withColumn('volume', data['length']*data['width']*data['height'])
    
    training, validation = data.randomSplit([0.75, 0.25], seed=42)
    
    assemble_features = VectorAssembler(
        inputCols=['length', 'width', 'height'],
        outputCol='features')
    regressor = GBTRegressor(
        featuresCol='features', labelCol='volume')
    pipeline = Pipeline(stages=[assemble_features, regressor])
    
    model = pipeline.fit(training)
    predictions = model.transform(validation)
    predictions.show()
    
    r2_evaluator = RegressionEvaluator(
        predictionCol='prediction', labelCol='volume',
        metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print(r2)
Example #27
def Distr_GBTRegressor(xy_train, xy_test):
    gr = GBTRegressor(minInstancesPerNode=20, maxDepth=25)
    evalu = RegressionEvaluator()
    grid_1 = ParamGridBuilder()\
            .addGrid(gr.maxIter, [100])\
            .addGrid(gr.subsamplingRate, [0.5,0.8,1.0])\
            .build()
    cv_1 = CrossValidator(estimator=gr,
                          estimatorParamMaps=grid_1,
                          evaluator=evalu,
                          numFolds=5)
    # grid-search for the best parameter combination; cvModel returns the best estimated model
    cvModel_1 = cv_1.fit(xy_train)
    print("Grid scores: ")
    best_params_1 = Get_best_params(cvModel_1, 'reg')['subsamplingRate']
    grid = ParamGridBuilder()\
            .addGrid(gr.maxIter, [110,120])\
            .addGrid(gr.subsamplingRate, [best_params_1,])\
            .build()
    cv = CrossValidator(estimator=gr,
                        estimatorParamMaps=grid,
                        evaluator=evalu,
                        numFolds=5)
    # grid-search for the best parameter combination; cvModel returns the best estimated model
    cvModel = cv.fit(xy_train)
    best_params = Get_best_params(cvModel, 'reg')

    print("Best parameters set found: %s" % best_params)

    return cvModel.bestModel
Example #28
def train(df: DataFrame):
    from pyspark.ml.linalg import Vectors

    def astraining(row: Row) -> Row:
        d = row.asDict()
        sales = d['sales']  # grab the label before dropping it
        del d['Sales_Pred']
        del d['sales']
        # GBTRegressor expects a Vector column, not a plain Python list
        return Row(label=sales, features=Vectors.dense(list(d.values())))

    # use the df parameter, not the enclosing function name
    t3 = df.rdd \
        .filter(lambda r: r["sales"] is not None) \
        .map(astraining)

    gbt = GBTRegressor(maxIter=10)
    training_df = spark.createDataFrame(t3)
    training_df.show()
    model = gbt.fit(training_df)
    print("----------- after fit ------------")
    return model
Example #29
 def model_define(self):
     """Returns a model with the hyperparameters inputted in :func:
     `get_parameters`
     Returns:
         (pyspark.ml.regression.GBTRegressor)
             Gradient Boosting Tree Regression model
     """
     return GBTRegressor()
Example #30
def _get_xgboost_regressor_model(col, train):
    '''
    Gradient Boosted Tree Regressor Model is created for predicting Missing Values
    '''
    print(
        'Using Gradient Boosted Regressor Module to predict Missing Values ...'
    )
    reg_model = GBTRegressor(labelCol=col)
    #params = ParamGridBuilder().addGrid(reg_model.maxDepth, [5, 10, 20]).\
    #                            addGrid(reg_model.minInfoGain, [0.0, 0.01, 1.0]).\
    #                            addGrid(reg_model.maxBins, [32, 20, 50, 100, 300]).build()
    #cv = CrossValidator(estimator=reg_model,
    #                   estimatorParamMaps=params,
    #                   evaluator=RegressionEvaluator(labelCol=col),
    #                   numFolds=10)
    reg_model = reg_model.fit(train)
    return reg_model
Example #31
def estimators(config):
    # All models to choose amongst for simple regression/classification
    model_type = config['base']['model_type']    
    model = config['base']['model']
    if model == 'rf':
        if model_type == 'classification':
            glm = RandomForestClassifier(
                        featuresCol = config['base']['featuresCol'],
                        labelCol = config['base']['labelCol'],
                        predictionCol = config['base']['predictionCol'],
                        numTrees = config['model']['numTrees'],
                        maxDepth = config['model']['maxDepth']
                        )
        elif model_type == 'regression':
            glm = RandomForestRegressor(
                        featuresCol = config['base']['featuresCol'],
                        labelCol = config['base']['labelCol'],
                        predictionCol = config['base']['predictionCol'],
                        numTrees = config['model']['numTrees'],
                        maxDepth = config['model']['maxDepth']
                        )
    if model == 'gbm':
        if model_type == 'classification':
            glm = GBTClassifier(
                        featuresCol = config['base']['featuresCol'],
                        labelCol = config['base']['labelCol'],
                        predictionCol = config['base']['predictionCol'],
                        lossType = config['model']['lossType'],
                        maxDepth = config['model']['maxDepth'],
                        stepSize = config['model']['stepSize']
                        )
        elif model_type == 'regression':
            glm = GBTRegressor(
                        featuresCol = config['base']['featuresCol'],
                        labelCol = config['base']['labelCol'],
                        predictionCol = config['base']['predictionCol'],
                        lossType = config['model']['lossType'],
                        maxDepth = config['model']['maxDepth'],
                        stepSize = config['model']['stepSize']
                        )
    if model == 'logistic':
        glm = LogisticRegression(
                    featuresCol = config['base']['featuresCol'],
                    labelCol = config['base']['labelCol'],
                    predictionCol = config['base']['predictionCol'],
                    threshold = config['model']['threshold'],
                    regParam = config['model']['regParam'],
                    elasticNetParam = config['model']['elasticNetParam']
                    )
    if model == 'linear':
        glm = LinearRegression(
                    featuresCol = config['base']['featuresCol'],
                    labelCol = config['base']['labelCol'],
                    predictionCol = config['base']['predictionCol'],
                    regParam = config['model']['regParam'],
                    elasticNetParam = config['model']['elasticNetParam']
                    )
    return glm
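
A sketch of a config dict that exercises the 'gbm' regression branch above; every key is one the function reads, but the values are illustrative assumptions:

config = {
    'base': {'model_type': 'regression', 'model': 'gbm',
             'featuresCol': 'features', 'labelCol': 'label',
             'predictionCol': 'prediction'},
    'model': {'lossType': 'squared', 'maxDepth': 5, 'stepSize': 0.1},
}
glm = estimators(config)  # returns a configured GBTRegressor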
Example #32
# COMMAND ----------

dtrModel = dtr.fit(irisPetal)
dtrPredictions = dtrModel.transform(irisPetal)
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC Let's also build a gradient boosted tree.

# COMMAND ----------

from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor().setLabelCol('petalWidth')
print(gbt.explainParams())

# COMMAND ----------

gbtModel = gbt.fit(irisPetal)
gbtPredictions = gbtModel.transform(irisPetal)
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC We should really test our gradient boosted tree out-of-sample as it is easy to overfit with a GBT model.

# COMMAND ----------
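
A minimal out-of-sample check following the note above; the split and refit names are assumptions, while irisPetal, gbt, and regEval come from the cells before:

# COMMAND ----------

trainPetal, testPetal = irisPetal.randomSplit([0.75, 0.25], seed=42)
gbtModelOOS = gbt.fit(trainPetal)  # refit on the training split only
oosPredictions = gbtModelOOS.transform(testPetal)
print(regEval.evaluate(oosPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(oosPredictions, {regEval.metricName: 'rmse'}))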
Example #33
# Select example rows to display.
predictions.select("prediction", "label").show(30,False)

evaluator = RegressionEvaluator(metricName="rmse")  # rmse (default)|mse|r2|mae
RMSE = evaluator.evaluate(predictions)
print('RMSE: ' + str(RMSE))



#######################################################################################
#
#   Modeling - Gradient Boosting (Regression)
#
#######################################################################################

gbt = GBTRegressor(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
#gbt = GBTClassifier(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)

gbtmodel = gbt.fit(training)

# Make predictions.
predictions = gbtmodel.transform(testing)

# Select example rows to display.
predictions.select("prediction", "label").show(30,False)

evaluator = RegressionEvaluator(metricName="rmse")  # rmse (default)|mse|r2|mae
RMSE = evaluator.evaluate(predictions)
print('RMSE: ' + str(RMSE))

Example #34
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  stages += [stringIndexer, encoder]

#encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type','Peak_Time','weekend']
encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type']
for eCol in encColumns:
  encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol+"classVec")
  stages += [encoder]
#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount","tip_amount"]
# in Python 3, map() returns an iterator that cannot be concatenated with +, so use list comprehensions
assemblerInputs = [c + "classVec" for c in categoricalColumns] + [c + "classVec" for c in encColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)

from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'total_amount', maxIter=10)
gbt_model = gbt.fit(dataset)
gbt_model.write().overwrite().save("./nyc-01020304-18-6vm-gbt-model")



# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)


# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity")
pipeline = Pipeline().setStages([glr])
params = ParamGridBuilder().addGrid(glr.regParam, [0, 0.5, 1]).build()
evaluator = RegressionEvaluator()\
  .setMetricName("rmse")
Example #36

var_target   = 'rating'
var_features = [col for col in enriched1.columns if col not in ['userId','movieId','rating','timestamp','title','tag']]

# Generate Features Vector and Label
va = VectorAssembler(inputCols=var_features, outputCol="features")

modelprep1 = va.transform(enriched1).select('userId','movieId','rating','features')

training, testing, other = modelprep1.randomSplit([0.07, 0.03, 0.90])

print('[ INFO ] Training:          ' + str(training.count()) + ' records')
print('[ INFO ] Testing:           ' + str(testing.count()) + ' records')

gb = GBTRegressor(featuresCol="features", labelCol=var_target, predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)

gbmodel = gb.fit(training)
#gbmodel.save('/tmp/spark_models/kaggle_bike_sharing_gb_model')

predictions = gbmodel.transform(testing)

print('[ INFO ] Printing predictions vs label...')
# select first, then show(); show() prints and returns None
predictions.select('prediction', var_target).show(10, False)

evaluator = RegressionEvaluator(labelCol=var_target, predictionCol="prediction")
print('[ INFO ] Model Fit (RMSE):  ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})))
#print('[ INFO ] Model Fit (MSE):   ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "mse"})))
#print('[ INFO ] Model Fit (R2):    ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "r2"})))

total_runtime_seconds = (datetime.datetime.now() - start_time).seconds