Example 1
    def test_fit_maximize_metric(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder() \
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
            .build()
        tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        bestModel = tvsModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
        validationMetrics = tvsModel.validationMetrics

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
        self.assertEqual(len(grid), len(validationMetrics),
                         "validationMetrics has the same size of grid parameter")
        self.assertEqual(1.0, max(validationMetrics))
Example 2
    def test_save_load_simple_estimator(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)

        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
        self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
Example 3
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        self.assertEqual(len(tvsModel.subModels), len(grid))

        # Test that the default value of the "persistSubModels" option is "true"
        testSubPath = temp_path + "/testTrainValidationSplitSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        tvsModel.save(savingPathWithSubModels)
        tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
        self.assertEqual(len(tvsModel3.subModels), len(grid))
        tvsModel4 = tvsModel3.copy()
        self.assertEqual(len(tvsModel4.subModels), len(grid))

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
        self.assertEqual(tvsModel2.subModels, None)

        for i in range(len(grid)):
            self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
Example 4
    def test_copy(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder() \
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
            .build()
        tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        tvsCopied = tvs.copy()
        tvsModelCopied = tvsModel.copy()

        self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid,
                         "Copied TrainValidationSplit has the same uid of Estimator")

        self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
        self.assertEqual(len(tvsModel.validationMetrics),
                         len(tvsModelCopied.validationMetrics),
                         "Copied validationMetrics has the same size of the original")
        for index in range(len(tvsModel.validationMetrics)):
            self.assertEqual(tvsModel.validationMetrics[index],
                             tvsModelCopied.validationMetrics[index])
def build_model(training):
  #training = read_data()
  training.cache()
  
  columns = training.columns
  columns.remove("Occupancy")
  
  assembler = VectorAssembler(inputCols=columns, outputCol="featureVec")
  lr = LogisticRegression(featuresCol="featureVec", labelCol="Occupancy")
  
  pipeline = Pipeline(stages=[assembler, lr])
  
  param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 1.0]) \
    .build()
  
  evaluator = BinaryClassificationEvaluator(labelCol="Occupancy")
  
  validator = TrainValidationSplit(estimator=pipeline,
                             estimatorParamMaps=param_grid,
                             evaluator=evaluator,
                             trainRatio=0.9)
  
  validator_model = validator.fit(training)
  return validator_model.bestModel
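As a usage note for the function above, here is a minimal sketch of how build_model might be called, assuming read_data() (hinted at by the commented-out line) returns a DataFrame with the sensor columns plus the Occupancy label; these names are assumptions, not part of the original.

# Hypothetical usage of build_model(); read_data() and the column names
# are assumptions taken from the snippet above.
training = read_data()
best_model = build_model(training)        # PipelineModel: VectorAssembler + LogisticRegression
scored = best_model.transform(training)   # adds featureVec, rawPrediction, probability, prediction
scored.select("Occupancy", "prediction", "probability").show(5)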
Example 6
 def test_parallel_evaluation(self):
     dataset = self.spark.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
     evaluator = BinaryClassificationEvaluator()
     tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     tvs.setParallelism(1)
     tvsSerialModel = tvs.fit(dataset)
     tvs.setParallelism(2)
     tvsParallelModel = tvs.fit(dataset)
     self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
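The same setting can also be passed as a constructor keyword instead of through the setter; a minimal sketch, mirroring the parallelism=3 usage that appears in Example 24 further down.

# Equivalent constructor form of the parallelism setting exercised above (sketch only).
tvs_parallel = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                                    evaluator=evaluator, parallelism=2)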
Example 7
    def test_save_load_nested_estimator(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(100)
        lr2 = LogisticRegression().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

        originalParamMap = tvs.getEstimatorParamMaps()
        loadedParamMap = loadedTvs.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    def test_save_load_trained_model(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame([(Vectors.dense([0.0]), 0.0),
                                              (Vectors.dense([0.4]), 1.0),
                                              (Vectors.dense([0.5]), 0.0),
                                              (Vectors.dense([0.6]), 1.0),
                                              (Vectors.dense([1.0]), 1.0)] *
                                             10, ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr,
                                   estimatorParamMaps=grid,
                                   evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        lrModel = tvsModel.bestModel

        tvsModelPath = temp_path + "/tvsModel"
        lrModel.save(tvsModelPath)
        loadedLrModel = LogisticRegressionModel.load(tvsModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame([(Vectors.dense([0.0]), 0.0),
                                              (Vectors.dense([0.4]), 1.0),
                                              (Vectors.dense([0.5]), 0.0),
                                              (Vectors.dense([0.6]), 1.0),
                                              (Vectors.dense([1.0]), 1.0)] *
                                             10, ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr,
                                   estimatorParamMaps=grid,
                                   evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        self.assertEqual(len(tvsModel.subModels), len(grid))

        # Test that the default value of the "persistSubModels" option is "true"
        testSubPath = temp_path + "/testTrainValidationSplitSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        tvsModel.save(savingPathWithSubModels)
        tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
        self.assertEqual(len(tvsModel3.subModels), len(grid))
        tvsModel4 = tvsModel3.copy()
        self.assertEqual(len(tvsModel4.subModels), len(grid))

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        tvsModel.write().option("persistSubModels",
                                "false").save(savingPathWithoutSubModels)
        tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
        self.assertEqual(tvsModel2.subModels, None)

        for i in range(len(grid)):
            self.assertEqual(tvsModel.subModels[i].uid,
                             tvsModel3.subModels[i].uid)
Example 10
  .addGrid(rf.maxDepth, [5,10,15])\
  .addGrid(rf.numTrees, [20,25,30])\
  .build()

# A TrainValidationSplit is used for hyper-parameter tuning. It takes a model estimator,
# parameter grid, and evaluator as input and runs the model multiple times to identify
# the most optimal model parameters
tvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=paramGrid,
                           evaluator=MulticlassClassificationEvaluator(),
                           trainRatio=0.8)

(trainingData, testData) = li.transform(va).randomSplit([0.7, 0.3])

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(trainingData)

predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

i2s.transform(predictions).groupBy('predictedLabel', 'maintenanceType')\
    .count().toPandas()
  
fi = model.bestModel.featureImportances.toArray()

sensorImportances = {}
for sensorIndex in range(len(fi)):
    sensorImportances[sensorNames[sensorIndex]] = round(fi[sensorIndex]*100)
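To make that dictionary easier to read, a short follow-up sketch could sort it by importance; sensorImportances comes from the loop above, while the print format is an assumption.

# Print sensors ordered by their importance percentage (sketch).
for sensor, importance in sorted(sensorImportances.items(),
                                 key=lambda kv: kv[1], reverse=True):
    print("%s: %d%%" % (sensor, importance))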
    "lab ~ . + color:value1 + color:value2"])\
  .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
  .addGrid(lr.regParam, [0.1, 2.0])\
  .build()

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
  .setMetricName("areaUnderROC")\
  .setRawPredictionCol("prediction")\
  .setLabelCol("label")

# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit()\
  .setTrainRatio(0.75)\
  .setEstimatorParamMaps(params)\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)

# COMMAND ----------

tvsFitted = tvs.fit(train)

# COMMAND ----------

evaluator.evaluate(tvsFitted.transform(test))
tvsFitted.write().overwrite().save("temp/ModelLocation")
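Since the fitted TrainValidationSplitModel is persisted above, it can be restored later with the matching load call; a minimal sketch, assuming the same path.

# Reload the persisted model and reuse it for scoring (sketch; the path mirrors the save above).
from pyspark.ml.tuning import TrainValidationSplitModel
reloaded = TrainValidationSplitModel.load("temp/ModelLocation")
evaluator.evaluate(reloaded.transform(test))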
Example 12
		scaler_model = scaler.fit(dataset)
		scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	else:
		scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
	dataset = scaler_model.transform(dataset)
	polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
	dataset = polyExpansion.transform(dataset)
	dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()
	glr = None
	if args.mode == 'train':
		glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures', family='Binomial', linkPredictionCol='link_pred')
		paramGrid = ParamGridBuilder() \
					.addGrid(glr.link, ['logit']) \
					.addGrid(glr.regParam, [1e-5]) \
					.build()
		tvs = TrainValidationSplit(estimator=glr, \
									estimatorParamMaps=paramGrid, \
									evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'), \
									trainRatio=0.7)
		tvs_model = tvs.fit(dataset)
		print('----> {}'.format(tvs_model.validationMetrics))
		if args.save_model:
			tvs_model.write().save('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
	else:
		#glr_model = GeneralizedLinearRegressionModel.load('/user/ronghui_safe/hgy/nid/models/glm_binomial_model')
		glr_model = TrainValidationSplitModel.load('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
		dataset = glr_model.transform(dataset).select(F.col('duration'), F.col('prediction'), F.col('key')).cache()
		if args.mode == 'eval':
			evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='duration', metricName='r2')
			print('----> The performance on the whole dataset is {}'.format(round(evaluator.evaluate(dataset), 4)))
		dataset.drop('duration').repartition(50).write.csv('/user/ronghui_safe/hgy/nid/weights/{}_{}'.format(args.query_month, args.mode), header=True)
Example 13
def get_als_model():
    ### Create our SparkSession; this can take a couple of minutes locally
    spark = SparkSession.builder.appName("Review_data_JSON2").config(
        'spark.sql.broadcastTimeout', '34000').getOrCreate()

    ### Open the data from review.json
    df_reviews = spark.read.json("data_source/review.json")

    ### Take only 192,000 rows from the original review.json.
    ratingsRDD = df_reviews.select("user_id", "business_id",
                                   "stars").take(192000)

    df_reviews = spark.createDataFrame(ratingsRDD)

    columns_indexing = ["user_id", "business_id"]

    ### Using StringIndexer to create a category feature for user_id and business_id.
    indexers = [
        StringIndexer(inputCol=column,
                      outputCol=column + "_index").fit(df_reviews)
        for column in columns_indexing
    ]

    ### Creating a Pipeline to index two columns from the current dataset.
    pipeline = Pipeline(stages=indexers)

    ### Creating the new DataFrame after encoding user and business Id's.
    df_reviews_prepro = pipeline.fit(df_reviews).transform(df_reviews)

    ### Splitting into training and test datasets.
    (training_review,
     test_review) = df_reviews_prepro.select("user_id_index",
                                             "business_id_index",
                                             "stars").randomSplit([0.8, 0.2])

    ### Creating our ALS prediction model.
    als_model = ALS(userCol="user_id_index",
                    itemCol="business_id_index",
                    ratingCol="stars",
                    coldStartStrategy="drop",
                    nonnegative=True)

    ### Tuning model
    param_grid = ParamGridBuilder()\
            .addGrid(als_model.rank, [12, 13, 14])\
            .addGrid(als_model.maxIter, [18, 19, 20])\
            .addGrid(als_model.regParam, [.17, .18, .19])\
            .build()

    ### Evaluate as Root Mean Squared Error
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="stars",
                                    predictionCol="prediction")

    ###
    tvs = TrainValidationSplit(estimator=als_model,
                               estimatorParamMaps=param_grid,
                               evaluator=evaluator)

    ### Training the model.
    model = tvs.fit(training_review)

    best_model = model.bestModel

    return best_model
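As a hedged follow-up, the tuned ALS model returned by get_als_model() can generate top-N recommendations through the standard recommendForAllUsers method; the snippet below is a usage sketch, not part of the original script.

### Usage sketch: top-10 business recommendations per user from the tuned ALS model.
als_best = get_als_model()
user_recs = als_best.recommendForAllUsers(10)
user_recs.show(5, truncate=False)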
Example 14
gbdt = GBTClassifier(labelCol='label', featuresCol='features')

#build param grid
paramGrid = ParamGridBuilder()\
.addGrid(gbdt.maxDepth, [6, 7, 8])\
.addGrid(gbdt.minInstancesPerNode, [200, 500, 800])\
.addGrid(gbdt.maxIter, [100, 120, 140])\
.addGrid(gbdt.stepSize, [0.04, 0.08])\
.addGrid(gbdt.subsamplingRate, [0.6, 0.8])\
.build()

#build train validation split
tvs = TrainValidationSplit(estimator=gbdt, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), trainRatio=0.8)

#train the model
model_sp = tvs.fit(trainData)

#predict
train_pred = model_sp.transform(trainData).select('label', 'prediction')
cv_pred = model_sp.transform(cvData).select('label', 'prediction')
test_pred = model_sp.transform(testData).select('label', 'prediction')

#convert spark df to pandas df
train_pred_pd = train_pred.toPandas()
cv_pred_pd = cv_pred.toPandas()
test_pred_pd = test_pred.toPandas()

#evaluate the f1 score
train_precision = metrics.precision_score(train_pred_pd.label.values, train_pred_pd.prediction.values)
train_recall = metrics.recall_score(train_pred_pd.label.values, train_pred_pd.prediction.values)
train_f1 = metrics.f1_score(train_pred_pd.label.values, train_pred_pd.prediction.values)
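The same sklearn metrics can be applied to the validation and test predictions prepared above; a brief sketch mirroring the train-set calls.

#evaluate the f1 score on the validation and test sets as well (sketch)
cv_f1 = metrics.f1_score(cv_pred_pd.label.values, cv_pred_pd.prediction.values)
test_f1 = metrics.f1_score(test_pred_pd.label.values, test_pred_pd.prediction.values)
print('train f1: %.4f, cv f1: %.4f, test f1: %.4f' % (train_f1, cv_f1, test_f1))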
Example 15
# ## Specify the evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="Class", metricName="areaUnderPR")


# ## Tuning the hyperparameters using holdout cross-validation

# For large DataFrames, holdout cross-validation will be more efficient.  Use
# the
# [TrainValidationSplit](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplit)
# class to specify holdout cross-validation:
from pyspark.ml.tuning import TrainValidationSplit
validator = TrainValidationSplit(estimator=rf, estimatorParamMaps=grid, evaluator=evaluator, trainRatio=0.75, seed=54321)

# Use the `fit` method to find the best set of hyperparameters:
%time cv_model = validator.fit(df_train)

# **Note:** Our train DataFrame is split again according to `trainRatio`.

# The resulting model is an instance of the
# [TrainValidationSplitModel](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplitModel)
# class:
type(cv_model)

# The cross-validation results are stored in the `validationMetrics` attribute:
cv_model.validationMetrics

# Plotting Validation Metric for each set of hyperparameters (NumTrees).

def plot_holdout_results(model):
  plt.plot(numTreesList, model.validationMetrics)
def SVM(trainingData, testData):
    start_time = time.time()
    print(" ")
    print("--------------------- SUPPORT VECTOR MACHINE ---------------------")

    svm = LinearSVC()
    ovr = OneVsRest(classifier=svm)

    # Parameters to tune
    paramGrid = ParamGridBuilder() \
        .addGrid(svm.regParam, [1, 0]) \
        .addGrid(svm.maxIter, [100, 1000]) \
        .build()

    # Tune over the parameter grid to choose the best model
    tvs = TrainValidationSplit(estimator=ovr,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator(),
                               # Validation split: 80% training, 20% validation.
                               trainRatio=0.8)

    model = tvs.fit(trainingData)

    prediction = model.transform(testData)

    result = prediction.select('features', 'label', 'prediction')

    # Compute accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
    f1score = evaluator.evaluate(prediction)

    # Confusion Matrix
    class_temp = prediction.select("label").groupBy("label") \
        .count().sort('count', ascending=False).toPandas()
    class_temp = class_temp["label"].values.tolist()

    y_true = prediction.select("label")
    y_true = y_true.toPandas()

    y_pred = prediction.select("prediction")
    y_pred = y_pred.toPandas()

    cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_temp)


    print("Accuracy Hold-Out: ", accuracy)
    print("F1-Score Hold-Out: ", f1score)
    print("")
    print("")
    print("Doc Parameters : [", model.explainParams(), "]")
    print("")
    print("")
    print("Confusion Matrix: ")
    print(cnf_matrix)
    print("SVM HoldOut Execution TIME:", time.time() - start_time)

    # Call the SVM variant that uses K-fold validation
    f1score_cv, cnf_matrix_cv, cv = SVMCV(trainingData, testData)

    # Return the better model between hold-out and K-folds
    if (f1score <= f1score_cv):
        return (f1score_cv, cnf_matrix_cv, cv)
    else:
        return (f1score, cnf_matrix, tvs)
"""RMSE for basic model after resolving cold start problem is 0.92"""

als = ALS(userCol="userid", itemCol="itemid", ratingCol="rating",coldStartStrategy="drop", nonnegative=True)

#Tuning model using ParamGridBuilder
param_grid=ParamGridBuilder()\
            .addGrid(als.rank,(12,13,14))\
            .addGrid(als.maxIter,(5,10,15))\
            .addGrid(als.regParam,[0.01,0.05,0.10])\
            .build()

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
#CrossValidation
tvs=TrainValidationSplit(estimator=als,estimatorParamMaps=param_grid,evaluator=evaluator)
model = tvs.fit(training)          

best_model=model.bestModel
predictions = best_model.transform(test)

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

"""Root-mean-square error = 0.916

Improving performance by cross validation
"""

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

als = ALS(userCol="userid", itemCol="itemid", ratingCol="rating",coldStartStrategy="drop", nonnegative=True)
Example 18
# In many cases, holdout cross-validation will be sufficient. Use the
# [TrainValidationSplit](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplit)
# class to specify holdout cross-validation:
from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, trainRatio=0.75, seed=54321)

# Background material:
# [TrainValidationSplit](https://es.wikipedia.org/wiki/Validacion_cruzada)

# For each combination of hyperparameters, the linear regression will be
# trained on a random 75% of the records in the `train` DataFrame
# and then evaluated on the remaining 25%.

# Use the `fit` method to find the best set of parameters:
%time tvs_model = tvs.fit(train)


# The result is an instance of the
# [TrainValidationSplitModel](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplitModel)
# class:
type(tvs_model)

# The cross-validation results are stored in the `validationMetrics` attribute:

tvs_model.validationMetrics

# These are the RMSE values for each set of hyperparameters. Smaller is better.

def plot_holdout_results(model):
  plt.plot(regParamList, model.validationMetrics)
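A usage sketch for the helper above; regParamList and matplotlib's plt are assumed to be defined earlier in the notebook.

# Plot the validation RMSE against regParam for the fitted model (sketch).
plot_holdout_results(tvs_model)
plt.xlabel("regParam")
plt.ylabel("RMSE (validation)")
plt.show()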
Example 19
def gbtRegression(df, conf):
    """ 
        input : df [spark.dataframe], conf [configuration params]
        output : gbt_regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    labelCol = conf["params"].get("labelCol")
    predictionCol=conf["params"].get("predictionCol")
    impurity = conf["params"].get("impurity", "variance")
    
    maxDepth    = conf["params"].get("maxDepth", 5)
    maxIter = conf["params"].get("maxIter", 20)
    maxBin = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf ["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB",256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    subsamplingRate= conf["params"].get("subsamplingRate", 1.0)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    lossType = conf["params"].get("lossType", "squared")
    seed = conf["params"].get("seed", None) 
    
    gbt = GBTRegressor(maxIter=maxIter, maxDepth=maxDepth, featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, gbt])
    
    print ("maxDepth : " , gbt.getMaxDepth())
    print ("maxIter : ", gbt.getMaxIter())
    
    # if ml-tuning is used
    if conf["tuning"]:
            
          # if ml-tuning with cross-validation is used
          if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
          
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, 
                                evaluator=evaluator, numFolds= folds)
            model = cv.fit(df)
          
          # if ml-tuning with train-validation split is used
          elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
          
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid, 
                                       evaluator=evaluator, trainRatio=tr )
            model = tvs.fit(df)
            
    # if ml-tuning is not used
    elif conf["tuning"] == None:
          print ("test")
          model = pipeline.fit(df)
          
    return model
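A hedged example of the conf dictionary this function expects, inferred from the keys read above; the values are illustrative only, and featureIndexer is assumed to be defined elsewhere in the module.

# Illustrative configuration for gbtRegression(); values are assumptions.
conf = {
    "params": {
        "featuresCol": "features",
        "labelCol": "label",
        "predictionCol": "prediction",
        "maxIter": 50,
        "maxDepth": 4,
    },
    # set to None to skip tuning, or e.g.
    # {"method": "trainvalsplit", "methodParam": 0.8, "paramGrids": {...}}
    "tuning": None,
}
model = gbtRegression(df, conf)   # df is an assumed input DataFrame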
Example 20
    #########################
    param_grid = ParamGridBuilder() \
        .addGrid(lr.regParam, [1.0, 0.1, 0.01, 0.001]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()

    #######################################
    # Hyperparameter Tuning - Grid Search #
    #######################################
    t_0 = time.time()
    train_val = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
        trainRatio=0.8)
    model = train_val.fit(train_df)

    print(model.bestModel.stages[-1].explainParam('regParam'))
    print(model.bestModel.stages[-1].explainParam('elasticNetParam'))
    print('Grid search took: {} seconds'.format(time.time() - t_0))

    #################
    # Model Metrics #
    #################
    t_0 = time.time()
    predictions = model.transform(test_df)
    print('Model training took: {} seconds'.format(time.time() - t_0))

    evaluator = BinaryClassificationEvaluator()

    auroc = evaluator.evaluate(predictions,
Example 21
def linearRegression(df, conf):
    """ 
        input : df [spark.dataframe], conf [configuration params]
        output : linear_regression model [model]
    """
    # read the parameters (default values)
    featuresCol= conf["params"].get("featuresCol", "features")
    labelCol= conf["params"].get("labelCol", "label")
    predictionCol = conf["params"].get("predictionCol", "prediction")
        
    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticnet_param = conf["params"].get("elasticNetParam", 0.0)
    tol = conf["params"].get("tol", 1e-6)
    fitIntercept = conf["params"].get("fitIntercept", True)
    standardization = conf["params"].get("standardization", True)
    solver = conf["params"].get("solver", "auto")
    weightCol = conf["params"].get("weightCol", None)
    aggregationDepth = conf["params"].get("aggregationDepth", 2)
    loss = conf["params"].get("loss", "squaredError")
    epsilon =  conf["params"].get("epsilon", 1.35)        
        
    lr = LinearRegression(maxIter=max_iter, regParam=reg_param, 
                              elasticNetParam=elasticnet_param)
        
    print ("maxIter : " , lr.getMaxIter())
    print ("regParam : " , lr.getRegParam())
    print ("aggrDepth : " , lr.getAggregationDepth())
        
    # if ml-tuning is used
    if conf["tuning"]:
            
        # if ml-tuning with cross-validation is used
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
          
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, 
                                evaluator=evaluator, numFolds= folds)
            model = cv.fit(df)
          
        # if ml-tuning with train-validation split is used
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
              pg.addGrid(key, paramGrids[key])
          
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, 
                                       evaluator=evaluator, trainRatio=tr )
            model = tvs.fit(df)
            
    # if ml-tuning is not used
    elif conf["tuning"] == None:
          print ("test")
          model = lr.fit(df)
          
    return model
paramGrid = (ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 6]).addGrid(
    dt.maxBins, [20, 40]).build())

# COMMAND ----------

# MAGIC %md ### To build a general model, we use _TrainValidationSplit_

# COMMAND ----------

dt_tvs = TrainValidationSplit(estimator=dtp,
                              evaluator=MulticlassClassificationEvaluator(),
                              estimatorParamMaps=paramGrid,
                              trainRatio=0.8)

dtModel = dt_tvs.fit(train)

# COMMAND ----------

# MAGIC %md ### Test the Recommender
# MAGIC Now that we've trained the recommender, let's see how accurately it predicts known stars in the test set.

# COMMAND ----------

prediction = dtModel.transform(test)
predicted = prediction.select("features", "prediction", "trueLabel")
predicted.show(10)

# COMMAND ----------

# MAGIC %md ##TP, FP, TN, and FN all calculated
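The notebook fragment stops here; below is a hedged sketch of how those counts are typically computed from the predicted DataFrame above, assuming a binary 0/1 trueLabel.

# COMMAND ----------

# Sketch: confusion-matrix counts from the `predicted` DataFrame (assumed binary labels).
tp = float(predicted.filter("prediction == 1.0 AND trueLabel == 1").count())
fp = float(predicted.filter("prediction == 1.0 AND trueLabel == 0").count())
tn = float(predicted.filter("prediction == 0.0 AND trueLabel == 0").count())
fn = float(predicted.filter("prediction == 0.0 AND trueLabel == 1").count())
print("TP=%g FP=%g TN=%g FN=%g" % (tp, fp, tn, fn))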
Example 23
    # [0.1, 0.05, 0.01]) \
    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures,[1000]) \
        .addGrid(lr.regParam, [0.1]) \
        .build()

    crossval = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator().setMetricName(
            'areaUnderPR'
        ),  # set area Under precision-recall curve as the evaluation metric
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    cvModel = crossval.fit(training_spark_df_binary)
    cvModel.bestModel.save("model")
    # Make predictions
    train_prediction = cvModel.transform(training_spark_df_binary)
    test_prediction = cvModel.transform(testing_spark_df_binary)
    otherDatasetTest = cvModel.transform(otherDatasetTest_df_binary)

    pd_prediction = test_prediction.select("*").toPandas()
    actual = pd_prediction["label"].tolist()
    pred = pd_prediction["prediction"].tolist()

    pd_prediction_other_dataset = otherDatasetTest.select("*").toPandas()
    actual_otherdataset = pd_prediction_other_dataset["label"].tolist()
    pred_otherdataset = pd_prediction_other_dataset["prediction"].tolist()

    tn, fp, fn, tp = confusion_matrix(actual, pred).ravel()
Example 24
# building model
als = ALS(nonnegative=True, checkpointInterval=3, coldStartStrategy="drop")
paramGrid = ParamGridBuilder()\
    .addGrid(als.rank, [5, 30, 70])\
    .addGrid(als.regParam, [0.1, 1, 10])\
    .build()


rmse = RegressionEvaluator(metricName="rmse", labelCol="rating")
# trainRatio makes train:0.5 valid:0.25 and test:0.25
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=rmse,
    seed=seed,
    trainRatio=0.66,
    parallelism=3
)


model = tvs.fit(dftrain)
model.transform(dftrain).show()


testPred = model.transform(dftest)
testPred.show(5)
rmse.evaluate(testPred)


model_path = os.getcwd() + '/ALS_model2'
model.save(model_path)
Example 25
    EN = LinearRegression(labelCol = labelCol,
                          featuresCol = 'features',
                          fitIntercept=True,
                          standardization=False)

    EN_paramGrid = ParamGridBuilder().addGrid(EN.regParam, [10,1,0.1, 0.01,0.001])\
                                     .addGrid(EN.elasticNetParam, [0.0, 0.5, 1.0])\
                                     .build()

    EN_tvs = TrainValidationSplit(estimator=EN,
                                  estimatorParamMaps=EN_paramGrid,
                                  evaluator=RegressionEvaluator(labelCol=labelCol),
                                  # 80% of the data will be used for training, 20% for validation.
                                  trainRatio=0.8)

    EN_model = EN_tvs.fit(train)

    EN_model.save("s3://buj201-two-sigma-challenge/EN_model")

    GBR = GBTRegressor(labelCol=labelCol, lossType="squared")

    GBR_paramGrid = ParamGridBuilder().addGrid(GBR.maxDepth, [2,4,6])\
                                      .addGrid(GBR.maxIter, [50,100,200])\
                                      .addGrid(GBR.stepSize, [0.01,0.1,0.3])\
                                      .build()

    GBR_tvs = TrainValidationSplit(estimator=GBR,
                                   estimatorParamMaps=GBR_paramGrid,
                                   evaluator=RegressionEvaluator(labelCol=labelCol),
                                   # 80% of the data will be used for training, 20% for validation.
                                   trainRatio=0.8)
Example 26
        "s3://rtl-databricks-datascience/lpater/processed_data/bids_train.parquet/"
    ).groupBy("deal_id").count().orderBy(
        'count', ascending=False).select("deal_id").toPandas()["deal_id"]
)  #create a list of deal_ids to select from, ordered by how common they are

# COMMAND ----------

#create the objects

tree = DecisionTreeClassifier()

paramGrid = ParamGridBuilder()\
    .addGrid(tree.maxDepth, [4, 5, 6, 7, 8]) \
    .build()

tvs = TrainValidationSplit(
    estimator=tree,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(metricName="areaUnderPR"),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8)

# COMMAND ----------

# Run TrainValidationSplit, and choose the best regularization parameter.
for deal_id in deal_ids:
    training_data = market_train.withColumnRenamed(deal_id, 'label')
    #testing_data = market_test.withColumnRenamed(deal_id,'label')
    model = tvs.fit(training_data)
    #print({"accuracy" : model.summary.accuracy})
    #print({variable_names[variable_number] : model.coefficients[variable_number.item()] for variable_number in model.coefficients.indices})
Example 27
        .load("data/mllib/sample_linear_regression_data.txt")
    train, test = data.randomSplit([0.7, 0.3])
    lr = LinearRegression(maxIter=10, regParam=0.1)

    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # TrainValidationSplit will try all combinations of values and determine best model using
    # the evaluator.
    paramGrid = ParamGridBuilder()\
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
        .build()

    # In this case the estimator is simply the linear regression.
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=paramGrid,
        evaluator=RegressionEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train)
    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    prediction = model.transform(test)
    for row in prediction.take(5):
        print(row)
    # $example off$
    spark.stop()
def generalizedLinearRegressor(dataFrame, conf):
    """
        input: df [spark.dataFrame], conf [configuration params]
        output: generalized linear regression model [model]
    """

    # calling params
    label_col = conf["params"].get("labelCol", "label")
    features_col = conf["params"].get("featuresCol", "features")
    prediction_col = conf["params"].get("predictionCol", "prediction")
    fam = conf["params"].get("family", "gaussian")

    fit_intercept = conf["params"].get("fitIntercept", True)
    max_iter = conf["params"].get("maxIter", 25)
    tolp = conf["params"].get("tol", 1e-6)
    reg_param = conf["params"].get("regParam", 0.0)
    weight_col = conf["params"].get("weightCol", None)
    solverp = conf["params"].get("solver", "irls")
    link_prediction_col = conf["params"].get("linkPredictionCol", None)
    variance_power = conf["params"].get("variancePower", 0.0)
    link_power = conf["params"].get("linkPower", None)

    if (fam == "gaussian"):
        li = conf["params"].get("link", "identity")
    elif (fam == "binomial"):
        li = conf["params"].get("link", "logit")
    elif (fam == "poisson"):
        li = conf["params"].get("link", "log")
    elif (fam == "gamma"):
        li = conf["params"].get("link", "inverse")
    elif (fam == "tweedle"):
        li = conf["params"].get("link", 1 - variance_power)
    else:
        li = conf["params"].get("link", None)

    glr = GeneralizedLinearRegression(labelCol=label_col,
                                      featuresCol=features_col,
                                      predictionCol=prediction_col,
                                      family=fam,
                                      link=li,
                                      fitIntercept=fit_intercept,
                                      maxIter=max_iter,
                                      tol=tolp,
                                      regParam=reg_param,
                                      solver=solverp,
                                      linkPredictionCol=link_prediction_col,
                                      variancePower=variance_power,
                                      linkPower=link_power)

    # with tuning
    if conf["tuning"]:
        # method: cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=glr,
                                estimatorParamMaps=grid,
                                evaluator=evaluator,
                                numFolds=folds)
            model = cv.fit(dataFrame)

        # method: train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=glr,
                                       estimatorParamMaps=grid,
                                       evaluator=evaluator,
                                       trainRatio=tr)
            model = tvs.fit(dataFrame)

    # without tuning
    else:
        model = glr.fit(dataFrame)

    return model
Example 29
    def test_save_load_pipeline_estimator(self):
        temp_path = tempfile.mkdtemp()
        training = self.spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ], ["id", "text", "label"])

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(5)
        lr2 = LogisticRegression().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100]) \
            .addGrid(ova.classifier, [lr1, lr2]) \
            .build()

        tvs = TrainValidationSplit(estimator=pipeline,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=MulticlassClassificationEvaluator())
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

        # Run train validation split, and choose the best set of parameters.
        tvsModel = tvs.fit(training)

        # test save/load of TrainValidationSplitModel
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              tvsModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline
        nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        tvs2 = TrainValidationSplit(estimator=nested_pipeline,
                                    estimatorParamMaps=paramGrid,
                                    evaluator=MulticlassClassificationEvaluator())
        tvs2Path = temp_path + "/tvs2"
        tvs2.save(tvs2Path)
        loadedTvs2 = TrainValidationSplit.load(tvs2Path)
        self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
        self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

        # Run train validation split, and choose the best set of parameters.
        tvsModel2 = tvs2.fit(training)
        # test save/load of TrainValidationSplitModel
        tvsModelPath2 = temp_path + "/tvsModel2"
        tvsModel2.save(tvsModelPath2)
        loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                              original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)
  .addGrid(lr.regParam, [0.1, 2.0])\
  .build()


# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
  .setMetricName("areaUnderROC")\
  .setRawPredictionCol("prediction")\
  .setLabelCol("label")


# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit()\
  .setTrainRatio(0.75)\
  .setEstimatorParamMaps(params)\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)


# COMMAND ----------

tvsFitted = tvs.fit(train)


# COMMAND ----------

Example 31
lr = LogisticRegression(maxIter=10, regParam=0.01)

paramMap = ({lr.regParam: 0.1, lr.threshold: 0.55, lr.maxIter: 100, })

paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.threshold, [0.51, 0.56])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=MulticlassClassificationEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

model = tvs.fit(X_y) # lr.fit(train, paramMap)

################################################TESTING_MODEL###############################################################
print('*' * 50, 'TESTING_MODEL', '*' * 50)
predictions = model.transform(test)
result = predictions.select("features", "label", "prediction").collect()
for row in result:
    print("features=%s, label=%s -> prediction=%s"
          % (row.features, row.label, row.prediction))

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Acc = %g " % (accuracy))
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "initialization",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [
            x for x in algosToRun if x.get_algorithm_slug() == self._slug
        ][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()

        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})

        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns) - set(allDateCols))
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        appType = self._dataframe_context.get_app_type()

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print("model_path", model_path)
        pipeline_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/pipeline/"
        model_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/model"
        pmml_filepath = "file://" + str(model_path) + "/" + str(
            self._slug) + "/modelPmml"

        df = self._data_frame
        levels = df.select(result_column).distinct().count()

        appType = self._dataframe_context.get_app_type()

        model_filepath = model_path + "/" + self._slug + "/model"
        pmml_filepath = str(model_path) + "/" + str(
            self._slug) + "/traindeModel.pmml"

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "training",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        st = time.time()
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)

        trainingData, validationData = MLUtils.get_training_and_validation_data(
            df, result_column, 0.8)  # indexed

        labelIndexer = StringIndexer(inputCol=result_column, outputCol="label")
        # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")

        # Label Mapping and Inverse
        labelIdx = labelIndexer.fit(trainingData)
        labelMapping = {k: v for k, v in enumerate(labelIdx.labels)}
        inverseLabelMapping = {
            v: float(k)
            for k, v in enumerate(labelIdx.labels)
        }
        if self._dataframe_context.get_trainerMode() == "autoML":
            automl_enable = True
        else:
            automl_enable = False
        clf = NaiveBayes()
        if not algoSetting.is_hyperparameter_tuning_enabled():
            algoParams = algoSetting.get_params_dict()
        else:
            algoParams = algoSetting.get_params_dict_hyperparameter()
        print("=" * 100)
        print(algoParams)
        print("=" * 100)
        clfParams = [prm.name for prm in clf.params]
        algoParams = {
            getattr(clf, k): v if isinstance(v, list) else [v]
            for k, v in algoParams.items() if k in clfParams
        }
        #print("="*100)
        #print("ALGOPARAMS - ",algoParams)
        #print("="*100)

        paramGrid = ParamGridBuilder()
        # if not algoSetting.is_hyperparameter_tuning_enabled():
        #     for k,v in algoParams.items():
        #         if v == [None] * len(v):
        #             continue
        #         if k.name == 'thresholds':
        #             paramGrid = paramGrid.addGrid(k,v[0])
        #         else:
        #             paramGrid = paramGrid.addGrid(k,v)
        #     paramGrid = paramGrid.build()

        # if not algoSetting.is_hyperparameter_tuning_enabled():
        for k, v in algoParams.items():
            print(k, v)
            if v == [None] * len(v):
                continue
            paramGrid = paramGrid.addGrid(k, v)
        paramGrid = paramGrid.build()
        # else:
        #     for k,v in algoParams.items():
        #         print k.name, v
        #         if v[0] == [None] * len(v[0]):
        #             continue
        #         paramGrid = paramGrid.addGrid(k,v[0])
        #     paramGrid = paramGrid.build()

        #print("="*143)
        #print("PARAMGRID - ", paramGrid)
        #print("="*143)

        if len(paramGrid) > 1:
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {
                "name": hyperParamInitParam["evaluationMetric"]
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]
        else:
            evaluationMetricDict = {
                "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC
            }
            evaluationMetricDict[
                "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[
                    evaluationMetricDict["name"]]

        self._result_setter.set_hyper_parameter_results(self._slug, None)

        if validationDict["name"] == "kFold":
            numFold = int(validationDict["value"])
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkGridSearchResult(
                    estimator, paramGrid, appType, modelFilepath, levels,
                    evaluationMetricDict, trainingData, validationData,
                    numFold, self._targetLevel, labelMapping,
                    inverseLabelMapping, df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models(
                )
                self._result_setter.set_hyper_parameter_results(
                    self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug, {
                        "ignoreList":
                        pySparkHyperParameterResultObj.get_ignore_list(),
                        "hideColumns":
                        pySparkHyperParameterResultObj.get_hide_columns(),
                        "metricColName":
                        pySparkHyperParameterResultObj.
                        get_comparison_metric_colname(),
                        "columnOrder":
                        pySparkHyperParameterResultObj.get_keep_columns()
                    })

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()

            else:
                if automl_enable:
                    paramGrid = (ParamGridBuilder().addGrid(
                        clf.smoothing, [1.0, 0.2]).build())
                crossval = CrossValidator(
                    estimator=estimator,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator()
                    if levels == 2 else MulticlassClassificationEvaluator(),
                    numFolds=3 if numFold is None else
                    numFold)  # use 3+ folds in practice
                cvnb = crossval.fit(trainingData)
                prediction = cvnb.transform(validationData)
                bestModel = cvnb.bestModel

        else:
            train_test_ratio = float(
                self._dataframe_context.get_train_test_split())
            estimator = Pipeline(stages=[pipeline, labelIndexer, clf])
            if algoSetting.is_hyperparameter_tuning_enabled():
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                pySparkHyperParameterResultObj = PySparkTrainTestResult(
                    estimator, paramGrid, appType, modelFilepath, levels,
                    evaluationMetricDict, trainingData, validationData,
                    train_test_ratio, self._targetLevel, labelMapping,
                    inverseLabelMapping, df)
                resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models(
                )
                self._result_setter.set_hyper_parameter_results(
                    self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(
                    self._slug, {
                        "ignoreList":
                        pySparkHyperParameterResultObj.get_ignore_list(),
                        "hideColumns":
                        pySparkHyperParameterResultObj.get_hide_columns(),
                        "metricColName":
                        pySparkHyperParameterResultObj.
                        get_comparison_metric_colname(),
                        "columnOrder":
                        pySparkHyperParameterResultObj.get_keep_columns()
                    })

                bestModel = pySparkHyperParameterResultObj.getBestModel()
                prediction = pySparkHyperParameterResultObj.getBestPrediction()

            else:
                tvs = TrainValidationSplit(
                    estimator=estimator,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator()
                    if levels == 2 else MulticlassClassificationEvaluator(),
                    trainRatio=train_test_ratio)

                tvspnb = tvs.fit(trainingData)
                prediction = tvspnb.transform(validationData)
                bestModel = tvspnb.bestModel

        modelmanagement_ = {
            param[0].name: param[1]
            for param in bestModel.stages[2].extractParamMap().items()
        }

        MLUtils.save_pipeline_or_model(bestModel, model_filepath)
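        # Collect (prediction, label) pairs as an RDD of tuples for the MulticlassMetrics computed below.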
        predsAndLabels = prediction.select(['prediction',
                                            'label']).rdd.map(tuple)
        # label_classes = prediction.select("label").distinct().collect()
        # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label']
        #results = transformed.select(["prediction","label"])
        # if len(label_classes) > 2:
        #     metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model
        # else:
        #     metrics = BinaryClassificationMetrics(predsAndLabels)
        posLabel = inverseLabelMapping[self._targetLevel]
        metrics = MulticlassMetrics(predsAndLabels)

        trainingTime = time.time() - st

        f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel],
                                    1.0)
        precision = metrics.precision(inverseLabelMapping[self._targetLevel])
        recall = metrics.recall(inverseLabelMapping[self._targetLevel])
        accuracy = metrics.accuracy

        print(f1_score, precision, recall, accuracy)

        #gain chart implementation
        def cal_prob_eval(x):
            # x is the per-row probability vector; return the probability of the positive class.
            # The original binary-only branch compared the whole vector to posLabel and indexed
            # past the end of a length-1 vector, so the general expression covers both cases.
            return float(x[int(posLabel)])

        column_name = 'probability'

        def y_prob_for_eval_udf():
            return udf(lambda x: cal_prob_eval(x))

        prediction = prediction.withColumn(
            "y_prob_for_eval",
            y_prob_for_eval_udf()(col(column_name)))
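        # NOTE: udf() without an explicit returnType produces a StringType column; pass DoubleType() if a numeric column is needed downstream.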

        try:
            pys_df = prediction.select(
                ['y_prob_for_eval', 'prediction', 'label'])
            gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval',
                                          'prediction', 'label', posLabel,
                                          self._spark)
            gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas()
        except:
            try:
                temp_df = pys_df.toPandas()
                gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval',
                                              'prediction', 'label', posLabel,
                                              self._spark)
                gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering()
            except:
                print("gain chart failed")
                gain_lift_KS_dataframe = None

        #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns)
        act_list = prediction.select('label').collect()
        actual = [int(row.label) for row in act_list]

        pred_list = prediction.select('prediction').collect()
        predicted = [int(row.prediction) for row in pred_list]
        prob_list = prediction.select('probability').collect()
        probability = [list(row.probability) for row in prob_list]
        # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'),
        # "probability":prediction.select('probability'),"feature_importance":None,
        # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping}
        objs = {
            "trained_model": bestModel,
            "actual": actual,
            "predicted": predicted,
            "probability": probability,
            "feature_importance": None,
            "featureList": list(categorical_columns) + list(numerical_columns),
            "labelMapping": labelMapping
        }

        conf_mat_ar = metrics.confusionMatrix().toArray()
        print(conf_mat_ar)
        confusion_matrix = {}
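        # Build a nested dict keyed by class names: confusion_matrix[actual][predicted] = count.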
        for i in range(len(conf_mat_ar)):
            confusion_matrix[labelMapping[i]] = {}
            for j, val in enumerate(conf_mat_ar[i]):
                confusion_matrix[labelMapping[i]][labelMapping[j]] = val
        print(confusion_matrix)  # accuracy of the model
        # ROC curve implementation
        y_prob = probability
        y_score = predicted
        y_test = actual
        logLoss = log_loss(y_test, y_prob)
        if levels <= 2:
            positive_label_probs = []
            for val in y_prob:
                positive_label_probs.append(val[int(posLabel)])
            roc_auc = roc_auc_score(y_test, y_score)

            roc_data_dict = {
                "y_score": y_score,
                "y_test": y_test,
                "positive_label_probs": positive_label_probs,
                "y_prob": y_prob,
                "positive_label": posLabel
            }
            roc_dataframe = pd.DataFrame({
                "y_score":
                y_score,
                "y_test":
                y_test,
                "positive_label_probs":
                positive_label_probs
            })
            #roc_dataframe.to_csv("binary_roc_data.csv")
            fpr, tpr, thresholds = roc_curve(y_test,
                                             positive_label_probs,
                                             pos_label=posLabel)
            roc_df = pd.DataFrame({
                "FPR": fpr,
                "TPR": tpr,
                "thresholds": thresholds
            })
            roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]

            optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
            fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
            tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]

            rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})

            unique_fpr = rounded_roc_df["FPR"].unique()

            final_roc_df = rounded_roc_df.groupby("FPR",
                                                  as_index=False)[["TPR"
                                                                   ]].mean()
            endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
        elif levels > 2:
            positive_label_probs = []
            for val in y_prob:
                positive_label_probs.append(val[int(posLabel)])

            y_test_roc_multi = []
            for val in y_test:
                if val != posLabel:
                    val = posLabel + 1
                    y_test_roc_multi.append(val)
                else:
                    y_test_roc_multi.append(val)

            y_score_roc_multi = []
            for val in y_score:
                if val != posLabel:
                    val = posLabel + 1
                    y_score_roc_multi.append(val)
                else:
                    y_score_roc_multi.append(val)

            roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi)

            fpr, tpr, thresholds = roc_curve(y_test_roc_multi,
                                             positive_label_probs,
                                             pos_label=posLabel)
            roc_df = pd.DataFrame({
                "FPR": fpr,
                "TPR": tpr,
                "thresholds": thresholds
            })
            roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"]

            optimal_index = np.argmax(np.array(roc_df["tpr-fpr"]))
            fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"]
            tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"]

            rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4})
            unique_fpr = rounded_roc_df["FPR"].unique()
            final_roc_df = rounded_roc_df.groupby("FPR",
                                                  as_index=False)[["TPR"
                                                                   ]].mean()
            endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3})
        # Calculating prediction_split
        val_cnts = prediction.groupBy('label').count()
        val_cnts = map(lambda row: row.asDict(), val_cnts.collect())
        prediction_split = {}
        total_nos = prediction.select('label').count()
        for item in val_cnts:
            print(labelMapping)
            classname = labelMapping[item['label']]
            prediction_split[classname] = round(
                item['count'] * 100 / float(total_nos), 2)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH -
                                     1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName)
            bestModel.save("/".join(modelFilepathArr))
        runtime = round((time.time() - st_global), 2)

        try:
            print(pmml_filepath)
            pmmlBuilder = PMMLBuilder(self._spark, trainingData,
                                      bestModel).putOption(
                                          clf, 'compact', True)
            pmmlBuilder.buildFile(pmml_filepath)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception as e:
            print("PMML failed...", str(e))
            pass

        cat_cols = list(set(categorical_columns) - {result_column})
        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Naive Bayes")
        self._model_summary.set_algorithm_display_name("Naive Bayes")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(confusion_matrix)
        # self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(accuracy)
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats([precision, recall])
        self._model_summary.set_model_precision(precision)
        self._model_summary.set_model_recall(recall)
        self._model_summary.set_model_F1_score(f1_score)
        self._model_summary.set_model_log_loss(logLoss)
        self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe)
        self._model_summary.set_AUC_score(roc_auc)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(prediction_split)
        self._model_summary.set_validation_method("KFold")
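        # NOTE: the validation method is hard-coded to "KFold" even when the train/validation split branch was used above.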
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column])))
        self._model_summary.set_model_features(objs["featureList"])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(
                list(set(categorical_columns)) + [result_column]))
        #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees)
        self._model_summary.set_num_rules(300)
        self._model_summary.set_target_level(self._targetLevel)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": accuracy,
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": accuracy,
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        self._model_management = MLModelSummary()
        print(modelmanagement_)
        self._model_management.set_job_type(
            self._dataframe_context.get_job_name())  #Project name
        self._model_management.set_training_status(
            data="completed")  # training status
        self._model_management.set_target_level(
            self._targetLevel)  # target column value
        self._model_management.set_training_time(runtime)  # run time
        self._model_management.set_model_accuracy(round(metrics.accuracy, 2))
        # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy
        self._model_management.set_algorithm_name(
            "NaiveBayes")  #algorithm name
        self._model_management.set_validation_method(
            str(validationDict["displayName"]) + "(" +
            str(validationDict["value"]) + ")")  #validation method
        self._model_management.set_target_variable(
            result_column)  #target column name
        self._model_management.set_creation_date(data=str(
            datetime.now().strftime('%b %d ,%Y  %H:%M ')))  #creation date
        self._model_management.set_datasetName(self._datasetName)
        self._model_management.set_model_type(data='classification')
        self._model_management.set_var_smoothing(
            data=int(modelmanagement_['smoothing']))

        # self._model_management.set_no_of_independent_variables(df) #no of independent varables

        modelManagementSummaryJson = [
            ["Project Name",
             self._model_management.get_job_type()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            ["Training Status",
             self._model_management.get_training_status()],
            ["Accuracy",
             self._model_management.get_model_accuracy()],
            ["RunTime", self._model_management.get_training_time()],
            #["Owner",None],
            ["Created On",
             self._model_management.get_creation_date()]
        ]

        modelManagementModelSettingsJson = [
            ["Training Dataset",
             self._model_management.get_datasetName()],
            ["Target Column",
             self._model_management.get_target_variable()],
            ["Target Column Value",
             self._model_management.get_target_level()],
            ["Algorithm",
             self._model_management.get_algorithm_name()],
            [
                "Model Validation",
                self._model_management.get_validation_method()
            ],
            ["Model Type",
             self._model_management.get_model_type()],
            ["Smoothing",
             self._model_management.get_var_smoothing()],

            #,["priors",self._model_management.get_priors()]
            #,["var_smoothing",self._model_management.get_var_smoothing()]
        ]

        nbOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        nbPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards(
                self._model_summary, endgame_roc_df)
        ]
        nbDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        nbCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for
            cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        NB_Overview_Node = NarrativesTree()
        NB_Overview_Node.set_name("Overview")
        NB_Performance_Node = NarrativesTree()
        NB_Performance_Node.set_name("Performance")
        NB_Deployment_Node = NarrativesTree()
        NB_Deployment_Node.set_name("Deployment")
        for card in nbOverviewCards:
            NB_Overview_Node.add_a_card(card)
        for card in nbPerformanceCards:
            NB_Performance_Node.add_a_card(card)
        for card in nbDeploymentCards:
            NB_Deployment_Node.add_a_card(card)
        for card in nbCards:
            self._prediction_narrative.add_a_card(card)

        self._result_setter.set_model_summary({
            "naivebayes":
            json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_naive_bayes_model_summary(modelSummaryJson)
        self._result_setter.set_nb_cards(nbCards)
        self._result_setter.set_nb_nodes(
            [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node])
        self._result_setter.set_nb_fail_card({
            "Algorithm_Name": "Naive Bayes",
            "success": "True"
        })

        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._slug,
            "completion",
            "info",
            display=True,
            emptyBin=False,
            customMsg=None,
            weightKey="total")

        print("\n\n")
Esempio n. 33
0
    def hotmodel(self, sc, sets, movieRDD):
        '''
        training a super hot model
        '''
        als = ALS(coldStartStrategy="drop")
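        # ALS defaults to userCol="user", itemCol="item" and ratingCol="rating", which match the column names used below.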
        param_grid = ParamGridBuilder() \
        .addGrid(als.rank, [6, 8]) \
        .addGrid(als.maxIter,[8, 10, 12]) \
        .build()

        evaluator = RegressionEvaluator(
            metricName="mse",
            labelCol="rating",
            predictionCol="prediction")

        tvs = TrainValidationSplit(
            estimator=als,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
        )

        model = tvs.fit(sets['training']) ## should we save the model?
        best_rank = model.bestModel.rank
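        # maxIter is read back from the wrapped Java estimator, since the Python model object may not expose it directly.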
        best_iterations = model.bestModel._java_obj.parent().getMaxIter()
        print('hotmodel part 1')

        prediction = model.transform(sets['test'])
        prediction.alias('p')\
            .join(movieRDD.alias('m'), col('p.item') == col('m.item'))\
                .select([col('p.user'), col('m.title'), col('p.prediction'), col('p.rating')])
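        # NOTE: the joined DataFrame above is neither assigned nor displayed; presumably it was meant to be shown or stored.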

        mse = evaluator.evaluate(prediction)
        print("MSE = {}".format(mse))

        '''
        hot model's tinder date
        '''
        rating59169 = [
                (118661, 9), # Avengers
                (371746, 9),  # Iron Man 2008
                (94625, 9),  # Akira
                (1563738, 2), # One day 2011
                (800369, 8),  # Thor
                (1981115, 9), # Thor: The Dark World
                (3501632, 9), # Thor: Ragnarok
                (120338, 3), # Titanic
                (98635, 2), # When Harry Met Sally
                (125439, 3), # Notting Hill
                (332280, 1) # The Notebook
            ]
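        # NOTE: ratingRDD, ratingRDD2 and spark are assumed to come from the enclosing scope; they are not defined in this snippet.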
        
        user59169 = ratingRDD.groupBy().max('user').first()['max(user)'] + 1
        user59169DF = spark.createDataFrame\
        ([Row(user=user59169, item=r[0], rating=r[1]) for r in rating59169])
        user59169DF = user59169DF.select('user','item','rating')
        # user59169DF = sc.parallelize(user59169DF)
        new_model = ALS(rank=best_rank, maxIter=best_iterations, coldStartStrategy="drop")\
            .fit(ratingRDD2)

        unseen_movies = movieRDD.alias('m')\
            .join(user59169DF.alias('r'), col('m.item') == col('r.item'), how='left_anti')\
                .select('item')
        unseen_movies_user = unseen_movies.withColumn("user", lit(user59169))

        print('hot model part 2')

        spark.conf.set("spark.sql.crossJoin.enabled", "true")
        unseen_ratings = new_model.transform(unseen_movies_user)

        unseen_ratings_titles = unseen_ratings.alias('r')\
                        .join(movieRDD.alias('m'), col('r.item') == col('m.item'))\
                        .select(['user', 'title', 'prediction'])

        ratings_per_movie = ratingRDD.groupBy('item').count()
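        # Despite its name, enough_ratings below holds movies with fewer than 500 ratings; the left_anti join afterwards drops them, keeping only well-rated titles.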
        enough_ratings = ratings_per_movie.filter(col('count') < 500)
        enough_ratings.show()

        
        training_10 = unseen_ratings.alias('r')\
            .join(enough_ratings.alias('e'), col('r.item') == col('e.item'), how='left_anti')\
            .select(['item', 'user', 'prediction']).orderBy(col('prediction').desc())

        training_10.alias('t').join(movieRDD.alias('m'), col('t.item') == col('m.item'))\
            .select(['user', 'title', 'prediction'])\
                .orderBy(col('prediction').desc()).show(10, truncate=False)
    
    # spark.stop()
Esempio n. 34
0
    predictions[park] = models[park].transform(test_ds[park])
else:
  for park in park_data_with_date_dict:
    #standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
    #pred4data = standardScaler.transform(pred4data)
    lr = LinearRegression(maxIter = 10)
    # elastic net = a form of regularization that mixes LASSO and RIDGE penalties
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()
    tvs = TrainValidationSplit(estimator=lr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(),
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8)

    # Run TrainValidationSplit, and choose the best set of parameters.
    models[park] = tvs.fit(train_ds[park])
    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    predictions[park] = models[park].transform(test_ds[park] )
  saveModels(models,"LinearRegressionModel_","linear")
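  # saveModels is presumably a helper defined earlier in this notebook that persists each park's fitted model under the given name prefix.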

# COMMAND ----------

# NOTE: the prediction is STATIC ... so even though the plot is over time, it is only meant as a "simple" visualization (otherwise, for a proper variables-vs-prediction/true-value scatter plot we would need dimensionality reduction ... which seems like unnecessary effort)
def plotPredictions(code):
  fig, axes = plt.subplots()
  axes.plot([p.label for p in predictions[code].collect()], color = "blue")
  axes.plot([p.prediction for p in predictions[code].collect()], color = "red")
  #fig.autofmt_xdate()
  display(fig)
Esempio n. 35
0
    .addGrid(mlpc.layers, generateLayersCombination(hidden_layers = [1,2,5], input_layer = [5], output_layer = [2])) \
    .addGrid(mlpc.stepSize, [0.5,0.2,0.1,0.05,0.02]) \
    .build()
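# generateLayersCombination is presumably a project helper (defined earlier in this script) that expands the hidden-layer sizes into full [input, hidden..., output] layer lists for the MLP.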

print("Calculating best model...")
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
start = time.time()
tvs = TrainValidationSplit(
    estimator=mlpc,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8)
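# NOTE: a RegressionEvaluator is used here even though mlpc is a classifier; MulticlassClassificationEvaluator would be the more conventional choice.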

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train_data)

# Save the parameters of the best model into variables
bestmodel = model.bestModel
layers = list(bestmodel._java_obj.parent().getLayers())
iters = bestmodel._java_obj.parent().getMaxIter()
# solver = bestmodel._java_obj.parent().getSolver()
# tol = bestmodel._java_obj.parent().getTol()
lr = bestmodel._java_obj.parent().getStepSize()

end = time.time()

print("---------------------- Best model info ----------------------")
print("Max epochs : " + str(iters))
print("Learning rate : " + str(lr))
print("Layers : " + str(layers))
Esempio n. 36
0
                              evaluator=evaluator,
                              estimatorParamMaps=grid)

# COMMAND ----------

import mlflow
#from mlflow import spark
#import mlflow.mleap  # does this work with Spark 3.0?
#import mlflow.pyfunc
#import mleap.pyspark

# COMMAND ----------

with mlflow.start_run(run_name='TripDuration_lr'):

    tunedModel = tuning.fit(trainingData)

    # We log a custom tag, a custom metric, and the best model to the main run.
    mlflow.set_tag('Citibike_training', 'Data_team')

    rmse = evaluator.evaluate(tunedModel.transform(testData),
                              {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(tunedModel.transform(testData),
                            {evaluator.metricName: "r2"})
    mae = evaluator.evaluate(tunedModel.transform(testData),
                             {evaluator.metricName: "mae"})
    mse = evaluator.evaluate(tunedModel.transform(testData),
                             {evaluator.metricName: "mse"})

    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)
Esempio n. 37
0
# In[27]:

paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [50, 100]).addGrid(
    rf.maxDepth, [30, 15]).build()

# In[28]:

tvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=paramGrid,
                           evaluator=bce,
                           trainRatio=0.8)

# In[ ]:

model = tvs.fit(tr_data)

# In[ ]:

prediction = model.transform(test)

# In[5]:

# convert to .py file. Now let's submit to queue

# HW!
# Items to Work on: 3 Options:
#
# 1. ML
#  * make a logistic regression model
#  * use cross-validation to search a good space of logistic regression hyperparams
#    (a minimal sketch of this follows below)
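# A minimal sketch for the homework items above, assuming tr_data/test have the usual
# 'features'/'label' columns and reusing the BinaryClassificationEvaluator bce from earlier.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()
lr_cv = CrossValidator(estimator=lr,
                       estimatorParamMaps=lr_grid,
                       evaluator=bce,
                       numFolds=3)
lr_cv_model = lr_cv.fit(tr_data)
lr_cv_prediction = lr_cv_model.transform(test)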
Esempio n. 38
0
    def test_copy(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder() \
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
            .build()
        tvs = TrainValidationSplit(
            estimator=iee,
            estimatorParamMaps=grid,
            evaluator=evaluator,
            collectSubModels=True
        )
        tvsModel = tvs.fit(dataset)
        tvsCopied = tvs.copy()
        tvsModelCopied = tvsModel.copy()

        for param in [
            lambda x: x.getCollectSubModels(),
            lambda x: x.getParallelism(),
            lambda x: x.getSeed(),
            lambda x: x.getTrainRatio(),
        ]:
            self.assertEqual(param(tvs), param(tvsCopied))

        for param in [
            lambda x: x.getSeed(),
            lambda x: x.getTrainRatio(),
        ]:
            self.assertEqual(param(tvsModel), param(tvsModelCopied))

        self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid,
                         "Copied TrainValidationSplit has the same uid of Estimator")

        self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
        self.assertEqual(len(tvsModel.validationMetrics),
                         len(tvsModelCopied.validationMetrics),
                         "Copied validationMetrics has the same size of the original")
        for index in range(len(tvsModel.validationMetrics)):
            self.assertEqual(tvsModel.validationMetrics[index],
                             tvsModelCopied.validationMetrics[index])

        tvsModel.validationMetrics[0] = 'foo'
        self.assertNotEqual(
            tvsModelCopied.validationMetrics[0],
            'foo',
            "Changing the original validationMetrics should not affect the copied model"
        )
        tvsModel.subModels[0].getInducedError = lambda: 'foo'
        self.assertNotEqual(
            tvsModelCopied.subModels[0].getInducedError(),
            'foo',
            "Changing the original subModels should not affect the copied model"
        )
Esempio n. 39
0
    data = spark.read.format("libsvm")\
        .load("data/mllib/sample_linear_regression_data.txt")
    train, test = data.randomSplit([0.7, 0.3])
    lr = LinearRegression(maxIter=10, regParam=0.1)

    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # TrainValidationSplit will try all combinations of values and determine best model using
    # the evaluator.
    paramGrid = ParamGridBuilder()\
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
        .build()

    # In this case the estimator is simply the linear regression.
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = TrainValidationSplit(estimator=lr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(),
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8)

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train)
    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
    prediction = model.transform(test)
    for row in prediction.take(5):
        print(row)
    # $example off$
    spark.stop()