Example #1
    output_testing = assembler.transform(df_testing)

    final_data_training = output_training.select('features', 'demand')
    final_data_testing = output_testing.select('features', 'demand')

    #final_data_training.describe().show()
    #final_data_testing.describe().show()
    """  Model and predictions : """
    decisionTree = DecisionTreeRegressor(labelCol='demand', maxDepth=3)
    dt_model = decisionTree.fit(final_data_training)
    predictions = dt_model.transform(final_data_testing)
    #print("Decision tree model max depth = %g" % decisionTree.getMaxDepth())
    #print(dt_model.toDebugString)
    """ Evaluation rmse : """
    evaluatorRMSE = RegressionEvaluator(labelCol="demand",
                                        predictionCol="prediction",
                                        metricName="rmse")
    rmse = evaluatorRMSE.evaluate(predictions)
    errorsRMSE.append(rmse)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    evaluatorR2 = RegressionEvaluator(labelCol="demand",
                                      predictionCol="prediction",
                                      metricName="r2")
    r2 = evaluatorR2.evaluate(predictions)
    errorsR2.append(r2)
    print("R Squared Error (R2) on test data = %g" % r2)
""" Writing the errors in the files : """
file = open("decision_tree_rmse.txt", "w")
file.write("Training set contains " + str(N_DAYS_TRAIN) + " days i.e. " +
           str(N_OF_TIME_SLOTS_TRAIN) + " time slots \nTest set contains " +
Example #2
pipeline = Pipeline(stages=[glm])
pipeline2 = Pipeline(stages=[rfr])

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(glm.maxIter, [8, 10, 12]).addGrid(
    glm.regParam, [0.4, 0.6, 0.8]).build()
paramGrid2 = ParamGridBuilder().addGrid(rfr.maxDepth, [20, 25]).addGrid(
    rfr.maxBins, [32, 48]).build()

# COMMAND ----------

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(metricName="mae"),
                          numFolds=5)
crossval2 = CrossValidator(estimator=pipeline2,
                           estimatorParamMaps=paramGrid2,
                           evaluator=RegressionEvaluator(metricName="mae"),
                           numFolds=5)

# COMMAND ----------

trainingDataSJ = trainingData2.filter("city == 'sj'")
trainingDataIQ = trainingData2.filter("city == 'iq'")
testingDataSJ = testingData2.filter("city == 'sj'")
testingDataIQ = testingData2.filter("city == 'iq'")
#testingData2SJ = testingData2.filter("city == 'sj'")
#testingData2IQ = testingData2.filter("city == 'iq'")
# In[128]:

(trainingData, testData) = vectorDf.randomSplit([0.7, 0.3])

# In[129]:

model = rf.fit(trainingData)

# In[130]:

predictions = model.transform(testData)

# In[132]:

evaluator = RegressionEvaluator(labelCol="score",
                                predictionCol="prediction",
                                metricName="mae")
mae = evaluator.evaluate(predictions)

# In[133]:

print('Mean absolute error in number of up votes is %.2f' % mae)
# Add in post length

# In[55]:

stringIndexer = StringIndexer(inputCol="subreddit", outputCol="subredditIndex")
model = stringIndexer.fit(dfClean)
indexed = model.transform(dfClean)

#encoder = OneHotEncoder(inputCol="subredditIndex", outputCol="subredditVec")
Example #4
def rf_pipeline(rides_metar_joined, model_type, grid_dict, numFolds=5):

    rf = RandomForestRegressor(featuresCol='features', labelCol='count_scaled')

    # Build pipeline.
    if model_type == '1':
        assembler = VectorAssembler(inputCols=[
            'grid_x', 'grid_y', 'weekday', 'hour', 'fahrenheit', 'precip_in'
        ],
                                    outputCol='features')
        pipeline = Pipeline(stages=[assembler, rf])
    elif model_type == '2':
        encoder = OneHotEncoderEstimator(
            inputCols=['weekday', 'hour'],
            outputCols=['weekday_vec', 'hour_vec'])
        assembler = VectorAssembler(inputCols=[
            'grid_x', 'grid_y', 'weekday_vec', 'hour_vec', 'fahrenheit',
            'precip_in'
        ],
                                    outputCol='features')
        pipeline = Pipeline(stages=[encoder, assembler, rf])
    elif model_type == '3':
        encoder = OneHotEncoderEstimator(
            inputCols=['weekday', 'hour', 'grid_x', 'grid_y'],
            outputCols=['weekday_vec', 'hour_vec', 'grid_x_vec', 'grid_y_vec'])
        assembler = VectorAssembler(inputCols=[
            'grid_x_vec', 'grid_y_vec', 'weekday_vec', 'hour_vec',
            'fahrenheit', 'precip_in'
        ],
                                    outputCol='features')
        pipeline = Pipeline(stages=[encoder, assembler, rf])
    else:
        raise ValueError("Model type must be either 1, 2, or 3.")

    # TODO: is there a random search module?
    # start with numTrees: [10, 100, 1000]
    # maxDepth: [10, 100, 1000]
    # minInstancesPerNode: [1, 10, 100, 1000]
    paramGrid = ParamGridBuilder().addGrid(
        rf.numTrees, grid_dict['numTrees']).addGrid(
            rf.maxDepth, grid_dict['maxDepth']).addGrid(
                rf.minInstancesPerNode,
                grid_dict['minInstancesPerNode']).build()

    train, dev, test = rides_metar_joined.randomSplit([1 / 3] * 3)

    evaluator = RegressionEvaluator(labelCol='count_scaled',
                                    predictionCol='prediction',
                                    metricName='rmse')

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=numFolds)

    model = crossval.fit(train)
    pred_on_test = model.transform(test)
    rmse_on_test = evaluator.evaluate(pred_on_test)
    # Test these locally!
    #mean_error = pred_on_test.agg(pyspark.sql.functions.abs(col('count') - col('prediction')))
    #error_stddev = pred_on_test.agg(stddev(col('count') - col('prediction')))

    # get metric values with model.avgMetrics,
    # and parameters with model.getEstimatorParamMaps
    return model, pred_on_test, rmse_on_test
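
# A hedged usage sketch (not from the original source): call rf_pipeline and pair each
# tried parameter map with its average cross-validation metric, as the comment above
# suggests. `rides_metar_joined` and `grid_dict` are assumed to already exist.
model, pred_on_test, rmse_on_test = rf_pipeline(rides_metar_joined, '2', grid_dict)
for params, avg_rmse in zip(model.getEstimatorParamMaps(), model.avgMetrics):
    print({p.name: v for p, v in params.items()}, 'avg RMSE:', avg_rmse)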
Example #5
df = spark.createDataFrame(pdf)
(training, testing) = df.randomSplit([0.7, 0.3])

# Create a recommendations model using the ALS algorithm on the training data.
als = ALS(userCol='userid', itemCol='movieid', ratingCol='rating', coldStartStrategy='drop')

# Build a parameter grid to assign a range of values to
# the given ALS parameters.
output = ParamGridBuilder() \
             .addGrid(als.rank, [10, 11, 12]) \
             .addGrid(als.maxIter, [6, 7, 8]) \
             .addGrid(als.regParam, [.16, .17, .18]) \
             .build()

# Use the regression evaluator and fit the model to the training data.
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=output, evaluator=evaluator)
model = tvs.fit(training)
tuned_model = model.bestModel
predictions = tuned_model.transform(testing)

# Get the size of the dataset.
query = 'SELECT COUNT(*) FROM ratings;'
pdf = pd.read_sql_query(query, conn)
df = spark.createDataFrame(pdf).collect()

for row in df:
	count = int(row[0])

# Get the total number of movies a user has rated.
users = [1, 613, 614, 615, 616, 617, 628, 414]
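
# A hedged sketch (not in the original snippet): count how many movies each of the
# users above has rated, reusing the `ratings` table and `conn` connection from before
# (the column name `userid` is an assumption).
for user in users:
    query = 'SELECT COUNT(*) FROM ratings WHERE userid = %d;' % user
    user_count = int(pd.read_sql_query(query, conn).iloc[0, 0])
    print('User %d has rated %d movies' % (user, user_count))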
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

# Regression
from pyspark.ml.regression import LinearRegression
regression = LinearRegression(labelCol='consumption')

regression = regression.fit(cars_train)
predictions = regression.transform(cars_test)

# Calculate RMSE
from pyspark.ml.evaluation import RegressionEvaluator
# Find RMSE
RegressionEvaluator(labelCol='consumption').evaluate(predictions)
# Other metrics: mae, r2, mse
# Examine intercept
print(regression.intercept)
# Examine Coefficients
print(regression.coefficients)
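
# A hedged aside (not in the original): the other metrics listed above can be computed
# with the same evaluator by overriding metricName at evaluation time.
evaluator = RegressionEvaluator(labelCol='consumption')
for metric in ['mae', 'r2', 'mse']:
    print(metric, '=', evaluator.evaluate(predictions, {evaluator.metricName: metric}))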

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)
#Find correlations
numFeatures = autoDF.take(1)[0].features.size
labelRDD = autoDF.rdd.map(lambda lp: float(lp.label))
for i in range(numFeatures):
    featureRDD = autoDF.rdd.map(lambda lp: lp.features[i])
    corr = Statistics.corr(labelRDD, featureRDD, 'pearson')
    print('%d\t%g' % (i, corr))

#Split into training and testing data
(trainingData, testData) = autoDF.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()

#Build the model on training data
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)

print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(testData)
predictions.select("prediction", "label", "features").show()

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
evaluator.evaluate(predictions)
Example #8
    def test_evaluate_invalid_type(self):
        evaluator = RegressionEvaluator(metricName="r2")
        df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
        invalid_type = ""
        self.assertRaises(TypeError, evaluator.evaluate, df, invalid_type)
    def build_recommendation_model(self):
        logging.info("getting distinct users")
        print_with_time("getting distinct users")
        users = self.df.select(["user_id"]).distinct()

        logging.info("getting distinct items")
        print_with_time("getting distinct items")
        items = self.df.select(["item_id"]).distinct()

        logging.info("mapping user_id to number")
        print_with_time("mapping user_id to number")
        user_indexer = StringIndexer(inputCol="user_id",
                                     outputCol="user_id_no")
        self.user_indexed = user_indexer.fit(users).transform(users)
        self.user_indexed = self.user_indexed.select(
            self.user_indexed.user_id.cast("string"),
            self.user_indexed.user_id_no.cast("int"))

        logging.info("mapping item_id to number")
        print_with_time("mapping item_id to number")
        item_indexer = StringIndexer(inputCol="item_id",
                                     outputCol="item_id_no")
        self.item_indexed = item_indexer.fit(items).transform(items)
        self.item_indexed = self.item_indexed.select(
            self.item_indexed.item_id.cast("string"),
            self.item_indexed.item_id_no.cast("int"))

        logging.info("joining df with user_indexed rdd")
        print_with_time("joining df with user_indexed rdd")
        self.df = self.df.join(self.user_indexed, ["user_id"], 'inner')

        logging.info("joining df with item_indexed rdd")
        print_with_time("joining df with item_indexed rdd")
        self.df = self.df.join(self.item_indexed, ["item_id"], 'inner')
        self.df = self.df.select(["item_id_no", "user_id_no", "rating"])

        ############

        logging.info("splitting dataset into training and testing")
        print_with_time("splitting dataset into training and testing")
        (training, validation, test) = self.df.randomSplit([0.6, 0.2, 0.2])

        ######

        ranks = [25, 50, 100]
        regParam = [0.1, 0.01, 0.001]
        all_params = [(rank, reg) for rank in ranks for reg in regParam]

        min_mpr = float('inf')
        best_rank = -1
        best_reg = -1
        for (iteration_no, (rank, reg)) in enumerate(all_params):

            logging.info(iteration_no)
            print_with_time(str(iteration_no))
            logging.info("rank=%s, reg=%s " % (rank, reg))
            print_with_time("rank=%s, reg=%s " % (rank, reg))

            als = ALS(rank=rank,
                      regParam=reg,
                      nonnegative=True,
                      implicitPrefs=True,
                      userCol="user_id_no",
                      itemCol="item_id_no",
                      checkpointInterval=-1,
                      coldStartStrategy="drop",
                      ratingCol="rating")
            self.model = als.fit(training)

            logging.info("transforming the validation set")
            print_with_time("transforming the validation set")
            predictions = self.model.transform(validation)

            logging.info("getting rmse on validation set")
            print_with_time("getting rmse on validation set")

            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            logging.info("Root-mean-square error = " + str(rmse))
            print_with_time("Root-mean-square error = " + str(rmse))

            logging.info("getting MPR on validation set")
            print_with_time("getting MPR on validation set")

            ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
            mpr = ev.evaluate(sqlContext, predictions)
            logging.info("Mean Percentile Ranking = " + str(mpr))
            print_with_time("Mean Percentile Ranking = " + str(mpr))

            if mpr < min_mpr:
                min_mpr = mpr
                best_rank = rank
                best_reg = reg

        logging.info('The best model was trained with rank %s and reg %s' %
                     (best_rank, best_reg))
        print_with_time('The best model was trained with rank %s and reg %s' %
                        (best_rank, best_reg))

        ######

        logging.info("starting model training")
        print_with_time("starting model training")

        als = ALS(rank=best_rank,
                  regParam=best_reg,
                  nonnegative=True,
                  implicitPrefs=True,
                  userCol="user_id_no",
                  itemCol="item_id_no",
                  checkpointInterval=-1,
                  coldStartStrategy="drop",
                  ratingCol="rating")
        self.model = als.fit(training)

        logging.info("transforming the test set")
        print_with_time("transforming the test set")
        predictions = self.model.transform(test)

        logging.info("getting rmse on test set")
        print_with_time("getting rmse on test set")

        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        logging.info("Root-mean-square error = " + str(rmse))
        print_with_time("Root-mean-square error = " + str(rmse))

        logging.info("getting MPR on test set")
        print_with_time("getting MPR on test set")
        ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
        mpr = ev.evaluate(sqlContext, predictions)
        logging.info("Mean Percentile Ranking = " + str(mpr))
        print_with_time("Mean Percentile Ranking = " + str(mpr))
                              maxCategories=4)

display(train.select("hr", "cnt"))

gbt = GBTRegressor(labelCol="cnt")

# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
#  - maxIter: iterations, i.e., number of trees in each GBT ensemble
# In this example notebook, we keep these values small.  In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
paramGrid = ParamGridBuilder().addGrid(gbt.maxDepth,
                                       [2, 5]).addGrid(gbt.maxIter,
                                                       [10, 100]).build()
# We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true labels with predictions.
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=gbt.getLabelCol(),
                                predictionCol=gbt.getPredictionCol())
# Declare the CrossValidator, which runs model tuning for us.
cv = CrossValidator(estimator=gbt,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid)

pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])

# fit the training data to the pipeline...
pipelineModel = pipeline.fit(train)

predictions = pipelineModel.transform(test)

display(predictions.select("cnt", "prediction", *featuresCols))
predictions.select("cnt", "prediction", *featuresCols).show()
Example #11
    print('duration', best_params[0])
    print('RMSE', best_params[1])
    print('Best parameters after tuning:')
    print('rank', best_params[2])
    print('max_iter:', best_params[3])
    print('reg_param:', best_params[4])
    return best_params[5]  # best_model


if __name__ == '__main__':

    def create_spark():
        spark = SparkSession.builder.master("local[*]").appName('movie_recommend') \
            .config("spark.executor.memory", "16g").config("spark.network.timeout", "20000s") \
            .config("spark.executor.heartbeatInterval", "10000s").config('spark.driver.memory', '16g') \
            .getOrCreate()
        return spark

    spark = create_spark()
    evaluator = RegressionEvaluator(labelCol='rating')
    train_data = screening_ratings(spark)
    validation_data = prepare_data(train_data)[1]
    validation_data.cache()

    # 50, 20, 0.1 => 45, 18, 0.1 =>45, 20, 0.1 =>48, 20, 0.1
    model = get_best_model()
    localtime = time.localtime(time.time())
    model.save(
        'file:///Projects/python_projects/movie_recommendation/model/best_als_model_%s_%s_%s'
        % localtime[:3])
# MAGIC To start, we'll generate the predictions by using the first model in `petalModels`.

# COMMAND ----------

petalPredictions = petalModels[0].transform(irisPetal)
display(petalPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll evaluate the model using the `RegressionEvaluator`.

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
regEval = RegressionEvaluator().setLabelCol('petalWidth')

print(regEval.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC The default value for `RegressionEvaluator` is root mean square error (RMSE).  Let's view that first.

# COMMAND ----------

print(regEval.evaluate(petalPredictions))
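
# A hedged follow-up (not in the original notebook): the metric can also be overridden
# per call, e.g. to report R^2 with the same evaluator.
print(regEval.evaluate(petalPredictions, {regEval.metricName: 'r2'}))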

# COMMAND ----------

# MAGIC %md
Example #13
from pyspark.ml.recommendation import ALS

als = ALS()
"""
class pyspark.ml.recommendation.ALS(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, implicitPrefs=false, alpha=1.0, userCol="user", itemCol="item", seed=None, ratingCol="rating", nonnegative=false, checkpointInterval=10)[source]
# Now we set the parameters for the method
"""

als.setMaxIter(5).setSeed(seed).setRegParam(0.1).setItemCol(
    "movieID").setUserCol("userID").setRatingCol("rating")

from pyspark.ml.evaluation import RegressionEvaluator

reg_eval = RegressionEvaluator(predictionCol="prediction",
                               labelCol="rating",
                               metricName="rmse")

tolerance = 0.03
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
min_error = float('inf')
best_rank = -1
for rank in ranks:
    als.setRank(rank)
    model = als.fit(trainDF)
    predictDF = model.transform(validationDF)

    predicted_ratings_df = predictDF.filter(
Example #14
# COMMAND ----------

resultsDtDf = dtModel.transform(testSetDF)
resultsDtDf.write.save('/mnt/data/resultsDtDf.parquet',
                       format='parquet',
                       header=True,
                       mode="overwrite")

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
regEval = RegressionEvaluator(predictionCol="Prediction_cuisine",
                              labelCol="6714",
                              metricName="rmse")

# We can reuse the RegressionEvaluator, regEval, to judge the model based on the best Root Mean Squared Error
# Let's create our CrossValidator with 3 fold cross validation
crossval = CrossValidator(estimator=dtPipeline, evaluator=regEval, numFolds=3)

# Let's tune over our dt.maxDepth parameter on the values 2 and 3, creating a parameter grid using the ParamGridBuilder
paramGrid = (ParamGridBuilder().addGrid(dt.maxDepth, [2, 3]).build())

# Add the grid to the CrossValidator
crossval.setEstimatorParamMaps(paramGrid)

# Now let's find and return the best model
dtModelBest = crossval.fit(trainingSetDF).bestModel
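
# A hedged follow-up (not in the original): score the tuned model on the held-out set,
# reusing regEval and testSetDF from above.
resultsBestDf = dtModelBest.transform(testSetDF)
print("Best model RMSE on test data = %g" % regEval.evaluate(resultsBestDf))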
Example #15
def get_rmse(sdf):
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    return evaluator.evaluate(sdf)
    # Train model.  This also runs the indexers
    model = pipeline.fit(data)

    print("After model fit~~~~~~~~~~~~~~~~~")

    # Make predictions.
    predictions = model.transform(data)

    # # Select example rows to display.
    # predictions.select("prediction", target).show(5)

    predictions.select(target, "prediction",).write.mode("overwrite").format("com.databricks.spark.csv").save(output_file)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(predictionCol = "prediction", labelCol = target)
    
    # mse|rmse|r2|mae
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
    mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})

    print("mse:", mse, " rmse:", rmse, " r2:", r2, " mae:", mae)
    
    # # Load the testing data into spark
    # realData = sqlContext.read.format("com.databricks.spark.csv").options(header="true", inferschema="true").load("test/test_bank.csv")

    # # Add one empty column named "TARGET" to match the same schema between training/testing data
    # new_realData = realData.withColumn("TARGET", realData["ID"].cast(DoubleType()))


  
  
hyper={lm.maxIter: 10, poly.degree:3}

def build_ParamGrid():
  """Build a parameter grid over the linear model's maxIter values."""
  cvParamGrid = ParamGridBuilder() \
              .addGrid(lm.maxIter, [10, 20]) \
              .build()
  return cvParamGrid

cvParamGrid = build_ParamGrid()



re = RegressionEvaluator()




crossval = CrossValidator(estimator=pipe,
              estimatorParamMaps=cvParamGrid,
              evaluator=re,
              **cv)

model = crossval.fit(dat)

# Example of changes to gs
#----
# Only the parameters related to the grid are included.
Example #18
def test_param_search_estimator(  # pylint: disable=unused-argument
        metric_name, param_search_estimator, spark_session,
        dataset_regression):
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    lrParamMaps = [
        {
            lr.maxIter: 1,
            lr.standardization: False
        },
        {
            lr.maxIter: 200,
            lr.standardization: True
        },
        {
            lr.maxIter: 2,
            lr.standardization: False
        },
    ]
    best_params = {
        "LinearRegression.maxIter": 200,
        "LinearRegression.standardization": True
    }
    eva = RegressionEvaluator(metricName=metric_name)
    estimator = param_search_estimator(estimator=lr,
                                       estimatorParamMaps=lrParamMaps,
                                       evaluator=eva)
    with mlflow.start_run() as run:
        model = estimator.fit(dataset_regression)
        estimator_info = load_json_artifact("estimator_info.json")
        metadata = _gen_estimator_metadata(estimator)
        assert metadata.hierarchy == estimator_info["hierarchy"]

        param_search_estimator_info = estimator_info[
            metadata.uid_to_indexed_name_map[estimator.uid]]
        assert param_search_estimator_info[
            "tuned_estimator_parameter_map"] == _get_instance_param_map_recursively(
                lr, 1, metadata.uid_to_indexed_name_map)
        assert param_search_estimator_info[
            "tuning_parameter_map_list"] == _get_tuning_param_maps(
                estimator, metadata.uid_to_indexed_name_map)

        assert best_params == load_json_artifact("best_parameters.json")

        search_results = load_json_csv("search_results.csv")

    uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values({
            **_get_instance_param_map(estimator, uid_to_indexed_name_map),
            **{f"best_{k}": v
               for k, v in best_params.items()},
        }))
    assert run_data.tags == get_expected_class_tags(estimator)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == model.uid
    loaded_best_model = load_model_by_run_id(run_id, "best_model")
    assert loaded_best_model.stages[0].uid == model.bestModel.uid
    assert run_data.artifacts == [
        "best_model",
        "best_parameters.json",
        "estimator_info.json",
        "model",
        "search_results.csv",
    ]

    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id,
        "tags.`mlflow.parentRunId` = '{}'".format(run_id))
    assert len(child_runs) == len(search_results)

    for row_index, row in search_results.iterrows():
        row_params = json.loads(row.get("params", "{}"))
        for param_name, param_value in row_params.items():
            assert param_value == row.get(f"param.{param_name}")

        params_search_clause = " and ".join([
            "params.`{}` = '{}'".format(key.split(".")[1], value)
            for key, value in row_params.items()
        ])
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause)
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        run_data = get_run_data(child_run.info.run_id)
        child_estimator = estimator.getEstimator().copy(
            estimator.getEstimatorParamMaps()[row_index])
        assert run_data.tags == get_expected_class_tags(child_estimator)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values({
                **_get_instance_param_map(child_estimator, uid_to_indexed_name_map)
            }))
        assert (child_run.data.tags.get(MLFLOW_AUTOLOGGING) ==
                mlflow.pyspark.ml.AUTOLOGGING_INTEGRATION_NAME)

        metric_name = estimator.getEvaluator().getMetricName()
        if isinstance(estimator, CrossValidator):
            avg_metric_value = model.avgMetrics[row_index]
            avg_metric_name = f"avg_{metric_name}"
        else:
            avg_metric_value = model.validationMetrics[row_index]
            avg_metric_name = metric_name

        assert math.isclose(avg_metric_value,
                            run_data.metrics[avg_metric_name],
                            rel_tol=1e-6)
        assert math.isclose(avg_metric_value,
                            float(row.get(avg_metric_name)),
                            rel_tol=1e-6)

        if isinstance(estimator, CrossValidator) and Version(
                pyspark.__version__) > Version("3.2"):
            std_metric_name = f"std_{metric_name}"
            std_metric_value = model.stdMetrics[row_index]
            assert math.isclose(std_metric_value,
                                run_data.metrics[std_metric_name],
                                rel_tol=1e-6)
            assert math.isclose(std_metric_value,
                                float(row.get(std_metric_name)),
                                rel_tol=1e-6)
Example #19
                            predictionCol=prediction_Col_Name)

        # Linear_Regressor = [vecAssembler, lr]
        # Random_Forest = [vecAssembler, rfr]
        # DecisionTree_Regressor = [vecAssembler, dtr]
        GBT_Regressor = [vecAssembler, gbtr]

        models = [
            # ('Linear Regressor', Pipeline(stages=Linear_Regressor)),
            # ('Random Forest Regressor', Pipeline(stages=Random_Forest)),
            # ('Decision Tree Regressor', Pipeline(stages=DecisionTree_Regressor)),
            ('GBT Regressor', Pipeline(stages=GBT_Regressor)),
        ]

        evaluator = RegressionEvaluator(predictionCol=prediction_Col_Name,
                                        labelCol="max_value",
                                        metricName="mse")

        # split = df.randomSplit([0.80, 0.20])
        # train = split[0]
        # test = split[1]
        # train = train.cache()
        # test = test.cache()

        # min = 1000
        # for label, pipeline in models:
        #     model = pipeline.fit(df)
        #     pred = model.transform(df)
        #     score = evaluator.evaluate(pred)
        #     # print("\nCriteria Gas", criteria_gas)
        #     print(label, score)
# Flight duration model: Adding origin airport
# Some airports are busier than others. Some airports are bigger than others too. Flights departing from large or busy airports are likely to spend more time taxiing or waiting for their takeoff slot. So it stands to reason that the duration of a flight might depend not only on the distance being covered but also the airport from which the flight departs.

# You are going to make the regression model a little more sophisticated by including the departure airport as a predictor.

# These data have been split into training and testing sets and are available as flights_train and flights_test. The origin airport, stored in the org column, has been indexed into org_idx, which in turn has been one-hot encoded into org_dummy. The first few records are displayed in the terminal.

# Instructions
# 100 XP
# Fit a linear regression model to the training data.
# Make predictions for the testing data.
# Calculate the RMSE for predictions on the testing data.
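
# A hedged sketch (not part of the exercise) of the preprocessing described above,
# assuming a `flights` DataFrame with 'org' and 'km' columns and the Spark 3.x
# OneHotEncoder signature.
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

flights_idx = StringIndexer(inputCol='org', outputCol='org_idx').fit(flights).transform(flights)
flights_ohe = OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy']).fit(flights_idx).transform(flights_idx)
flights_feat = VectorAssembler(inputCols=['km', 'org_dummy'], outputCol='features').transform(flights_ohe)
flights_train, flights_test = flights_feat.randomSplit([0.8, 0.2])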

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
RegressionEvaluator(labelCol='duration').evaluate(predictions)
Example #21

myseed = 200206518

als_50 = ALS(userCol="userId", itemCol="movieId", seed=myseed, coldStartStrategy="drop")

# Training the model
model_50 = als_50.fit(train)

# Predictions
predictions_50 = model_50.transform(test)
predictions_50 = predictions_50.cache()

print("Evaluation for 50/50 split")
## Question 2.A.3 for time-split 50%
evaluator_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse_50 = evaluator_rmse.evaluate(predictions_50)
print("Root-mean-square error = " + str(rmse_50))

evaluator_mse = RegressionEvaluator(metricName="mse", labelCol="rating",predictionCol="prediction")
mse_50 = evaluator_mse.evaluate(predictions_50)
print("Mean-square error = " + str(mse_50))

evaluator_mae = RegressionEvaluator(metricName="mae", labelCol="rating",predictionCol="prediction")
mae_50 = evaluator_mae.evaluate(predictions_50)
print("Mean-Absolute error = " + str(mae_50))

############################################################################################################
################################## ALS Variation number 1 ##################################################
############################################################################################################
print("numIterations: %d" % trainingSummary.totalIterations)
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

lrPredictions = lrModel.transform(testingData)
lrPredictions.select("prediction", "sales", "std_features").show(5)

paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(predictionCol='prediction', labelCol='label',metricName= "r2"),
                          numFolds=3)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainingData)

print(cvModel.avgMetrics)
#print( cvModel.bestModel.stages[2].summary.r2)

for param in paramGrid:
    print(param)


cvPrediction = cvModel.transform(testingData)
cvPrediction.select("prediction", "sales", "std_features").show(5)
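
# A hedged follow-up (not in the original snippet): score the cross-validated model on
# the test data with the same metric used for tuning.
cvEvaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='r2')
print('Cross-validated r2 on test data = %g' % cvEvaluator.evaluate(cvPrediction))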
def main(spark):
    train_data = spark.read.parquet(
        'hdfs:/user/bm106/pub/project/cf_train.parquet')
    val_data = spark.read.parquet(
        'hdfs:/user/bm106/pub/project/cf_validation.parquet')

    train_data.createOrReplaceTempView('train_data')

    train_data_log = train_data.withColumn("logcount",
                                           log(train_data["count"]))
    val_data_log = val_data.withColumn("logcount", log(val_data["count"]))
    uid_indexer = StringIndexer(inputCol="user_id",
                                outputCol="user_num",
                                handleInvalid="skip")
    tid_indexer = StringIndexer(inputCol="track_id",
                                outputCol="track_num",
                                handleInvalid="skip")

    ranks = [4]
    regs = [1]
    alphas = [0.5]

    best_rmse = None
    best_rank = None
    best_alpha = None
    best_reg = None

    for rank in ranks:
        for alpha in alphas:
            for reg in regs:

                als = ALS(maxIter=3,
                          regParam=reg,
                          userCol="user_num",
                          itemCol="track_num",
                          ratingCol="logcount",
                          implicitPrefs=True,
                          coldStartStrategy="drop",
                          alpha=alpha,
                          rank=rank)
                print('model created')
                pipeline = Pipeline(stages=[uid_indexer, tid_indexer, als])
                print('pipeline created')
                als_model = pipeline.fit(train_data_log)
                print('model fit')
                predictions = als_model.transform(val_data_log)
                print('predictions created')
                evaluator = RegressionEvaluator(metricName="rmse",
                                                labelCol="count",
                                                predictionCol="prediction")
                print('evaluator created')
                rmse = evaluator.evaluate(predictions)
                print('evaluation ', rmse)
                if best_rmse is None or best_rmse > rmse:
                    best_rmse = rmse
                    best_rank = rank
                    best_alpha = alpha
                    best_reg = reg

    print('The best hyper parameters: Rank: {}, Reg: {}, Alpha: {}, RMSE: {}'.
          format(best_rank, best_reg, best_alpha, best_rmse))
    #The best hyper parameters: Rank: 4, Reg: 0.5, Alpha: 1, RMSE: 7.71668081677277

    als_model.save('anshul_project/log_model')
Example #24
# Vectorize the columns using the Vectors function
adV = adDF.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])

adV.show()
adV.printSchema()

# Fit a regression model
# lr = LinearRegression(featuresCol='features', labelCol='label', regParam = 0.3, elasticNetParam = 0.8)
lr = LinearRegression(featuresCol='features', labelCol='label')
lr_model = lr.fit(adV)
lr_model.save

# Apply the model
pred = lr_model.transform(adV)
pred.show(5)

# Inspect the results
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
evaluator.setMetricName('r2').evaluate(pred)

# Split into training and test sets (1)
train, test = adV.randomSplit([0.7, 0.3])

lr = LinearRegression() 
lr_model = lr.fit(train)

testFit = lr_model.transform(test)

testFit.show()
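
# A hedged follow-up (not in the original): measure the fit on the held-out split with
# the evaluator defined above.
print('test r2  :', evaluator.setMetricName('r2').evaluate(testFit))
print('test rmse:', evaluator.setMetricName('rmse').evaluate(testFit))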

def main():

    transaction_df = spark.read.format("org.apache.spark.sql.cassandra") \
    .options(table='transaction_data5', keyspace='bike_share_analytics').load()

    transaction_df = transaction_df.filter(transaction_df.startyear >= 2017)

    weather_transaction = transaction_df.withColumn(
        'startdate', functions.to_date(transaction_df['starttime']))
    weather_transaction = weather_transaction.dropna()
    weather_transaction.createOrReplaceTempView('weather_transaction')

    weather_transaction = spark.sql(
        "SELECT `start station id` ,`start station name`,starthour,startmonth,startdate,weekday,weekend,temperature,humidity,windspeed,COUNT(*) AS number_of_trips FROM weather_transaction GROUP BY `start station id`,`start station name`,starthour,startdate,startmonth,weekday,weekend,temperature,humidity,windspeed"
    )
    weather_transaction.createOrReplaceTempView('weather_transaction')

    data = spark.sql(
        "SELECT past.`start station id` AS sstid, past.starthour AS starthour, past.startdate AS startdate, past.startmonth AS startmonth, past.weekday AS weekday, past.weekend AS weekend,past.temperature AS temperature,past.humidity AS humidity, past.windspeed AS windspeed, past.number_of_trips AS past_trips, current.number_of_trips AS number_of_trips FROM weather_transaction past JOIN weather_transaction current ON past.`start station id` = current.`start station id` AND past.starthour = current.starthour - 1 AND past.startdate = current.startdate"
    )
    data.createOrReplaceTempView('data')

    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    citi_assembler = VectorAssembler(inputCols=[
        'sstid', 'weekday', 'weekend', 'starthour', 'startmonth',
        'temperature', 'humidity', 'windspeed', 'past_trips'
    ],
                                     outputCol='features')

    decisionTreeRegressor = DecisionTreeRegressor(featuresCol='features',
                                                  labelCol='number_of_trips')
    gbtRegressor = GBTRegressor(featuresCol='features',
                                labelCol='number_of_trips')
    randomForestRegressor = RandomForestRegressor(featuresCol='features',
                                                  labelCol='number_of_trips')

    decisionTree_pipeline = Pipeline(
        stages=[citi_assembler, decisionTreeRegressor])
    gbt_pipeline = Pipeline(stages=[citi_assembler, gbtRegressor])
    randomForest_pipeline = Pipeline(
        stages=[citi_assembler, randomForestRegressor])

    decisionTree_model = decisionTree_pipeline.fit(train)
    gbt_model = gbt_pipeline.fit(train)
    randomForest_model = randomForest_pipeline.fit(train)

    decisionTree_prediction = decisionTree_model.transform(validation)
    decisionTree_prediction = decisionTree_prediction.drop(
        'features', 'temperature', 'humidity', 'windspeed')
    decisionTree_prediction.write.format("org.apache.spark.sql.cassandra") \
    .options(table='dtree_weather', keyspace='bike_share_analytics').save()

    gbt_prediction = gbt_model.transform(validation)
    gbt_prediction = gbt_prediction.drop('features', 'temperature', 'humidity',
                                         'windspeed')
    gbt_prediction.write.format("org.apache.spark.sql.cassandra") \
    .options(table='gbt_weather', keyspace='bike_share_analytics').save()

    randomForest_prediction = randomForest_model.transform(validation)
    randomForest_prediction = randomForest_prediction.drop(
        'features', 'temperature', 'humidity', 'windspeed')
    randomForest_prediction.write.format("org.apache.spark.sql.cassandra") \
    .options(table='rf_weather', keyspace='bike_share_analytics').save()

    # evaluate the predictions
    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='number_of_trips',
                                       metricName='r2')
    r2_decisionTree = r2_evaluator.evaluate(decisionTree_prediction)
    r2_gbt = r2_evaluator.evaluate(gbt_prediction)
    r2_randomForest = r2_evaluator.evaluate(randomForest_prediction)

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='number_of_trips',
                                         metricName='rmse')
    rmse_decisionTree = rmse_evaluator.evaluate(decisionTree_prediction)
    rmse_gbt = rmse_evaluator.evaluate(gbt_prediction)
    rmse_randomForest = rmse_evaluator.evaluate(randomForest_prediction)

    print('Validation decision tree r2 =', r2_decisionTree)
    print('Validation gbt r2 =', r2_gbt)
    print('Validation random forest r2 =', r2_randomForest)

    print('Validation decision tree rmse =', rmse_decisionTree)
    print('Validation gbt rmse =', rmse_gbt)
    print('Validation random forest rmse =', rmse_randomForest)
Example #26
def linearRegression(df, conf):
    """ 
        input : df [spark.dataframe], conf [configuration params]
        output : linear_regression model [model]
    """
    # read the parameters (default values)
    featuresCol= conf["params"].get("featuresCol", "features")
    labelCol= conf["params"].get("labelCol", "label")
    predictionCol = conf["params"].get("predictionCol", "prediction")
        
    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticnet_param = conf["params"].get("elasticNetParam", 0.0)
    tol = conf["params"].get("tol", 1e-6)
    fitIntercept = conf["params"].get("fitIntercept", True)
    standardization = conf["params"].get("standardization", True)
    solver = conf["params"].get("solver", "auto")
    weightCol = conf["params"].get("weightCol", None)
    aggregationDepth = conf["params"].get("aggregationDepth", 2)
    loss = conf["params"].get("loss", "squaredError")
    epsilon =  conf["params"].get("epsilon", 1.35)        
        
    lr = LinearRegression(maxIter=max_iter, regParam=reg_param, 
                              elasticNetParam=elasticnet_param)
        
    print ("maxIter : " , lr.getMaxIter())
    print ("regParam : " , lr.getRegParam())
    print ("aggrDepth : " , lr.getAggregationDepth())
        
    # if ml-tuning is used
    if conf["tuning"]:
            
        # if cross-validation tuning is used
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
          
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, 
                                evaluator=evaluator, numFolds= folds)
            model = cv.fit(df)
          
        # if train-validation-split tuning is used
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
          
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, 
                                       evaluator=evaluator, trainRatio=tr )
            model = tvs.fit(df)
            
    # if no tuning is used
    elif conf["tuning"] is None:
          print ("test")
          model = lr.fit(df)
          
    return model
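
# A minimal usage sketch (assumptions not in the original: an active SparkSession, a
# DataFrame `train_df` with 'features' and 'label' columns, and this conf layout).
sample_conf = {
    "params": {"maxIter": 50, "regParam": 0.1, "elasticNetParam": 0.0},
    "tuning": None,
}
lr_model = linearRegression(train_df, sample_conf)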
# Set maxCategories so features with > 2 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=124)

# Train a GBT model.
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(20)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = model.stages[1]
print(gbtModel)  # summary only
Example #28
def gbtRegression(df, conf):
    """ 
        input : df [spark.dataframe], conf [configuration params]
        output : decisiontree_regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    labelCol = conf["params"].get("labelCol")
    predictionCol=conf["params"].get("predictionCol")
    impurity = conf["params"].get("impurity", "variance")
    
    maxDepth    = conf["params"].get("maxDepth", 5)
    maxIter = conf["params"].get("maxIter", 20)
    maxBin = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf ["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB",256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    subsamplingRate= conf["params"].get("subsamplingRate", 1.0)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    lossType = conf["params"].get("lossType", "squared")
    seed = conf["params"].get("seed", None) 
    
    gbt = GBTRegressor(maxIter=maxIter, maxDepth=maxDepth, featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, gbt])
    
    print ("maxDepth : " , gbt.getMaxDepth())
    print ("maxIter : ", gbt.getMaxIter())
    
    # if ml-tuning is used
    if conf["tuning"]:
            
          # if cross-validation tuning is used
          if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
          
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, 
                                evaluator=evaluator, numFolds= folds)
            model = cv.fit(df)
          
          # if train-validation-split tuning is used
          elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
          
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid, 
                                       evaluator=evaluator, trainRatio=tr )
            model = tvs.fit(df)
            
    # if no tuning is used
    elif conf["tuning"] is None:
          print ("test")
          model = pipeline.fit(df)
          
    return model
          itemCol='book_id',
          ratingCol='rating',
          coldStartStrategy='drop')

for rank in RANKS:
    for maxIter in MAX_ITERS:
        for regParam in REG_PARAMS:

            rank = int(rank)
            maxIter = int(maxIter)

            print("Running for " + str((rank, maxIter, regParam)))
            als.setParams(rank=rank, maxIter=maxIter, regParam=regParam)

            model = als.fit(interactions_train)
            # model.save(os.path.join(MODELS_DIRECTORY, 'als'))

            predictions = model.transform(interactions_val)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            print("Root-mean-square error = " + str(rmse))

            f = open("validation_errors.csv", "a+")
            f.write(
                str(rank) + "," + str(maxIter) + "," + str(regParam) + "," +
                str(rmse) + "\n")
            f.close()
            print("Finshed running for " + str((rank, maxIter, regParam)))
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

"""RMSE is meaningless without looking at the value of the medv variable, so let's take a look. """

train_df.describe().show()

"""Let's try some predictions using the test data set. First we'll find the R-Squared."""

lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction", "medv", "features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="medv", metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

"""And now the RMSE."""

test_result = lr_model.evaluate(test_df)
print('Root Mean Squared Error (RMSE) on test data = %g' % test_result.rootMeanSquaredError)

"""And now some analytics from the training along with residuals. """

print("Number of Iterations: %d" % trainingSummary.totalIterations)
print("Objective History %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show(5)

"""Here we will use this model to create some predictions."""