Example 1
def TVS(estimator, paramGrid, dataTrain, dataTest):
    # Define the TrainValidationSplit
    tvs = TrainValidationSplit(estimator=estimator,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(),
                               # 80% training, 20% validation
                               trainRatio=0.8)
    # Fit, keeping the best parameter combination from the grid
    model = tvs.fit(dataTrain)
    # Get predictions on the test set
    predictions = model.transform(dataTest)
    return predictions, model
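A minimal usage sketch for the helper above (assuming `dataTrain` and `dataTest` are DataFrames with "features" and a binary "label" column; the imports are the standard pyspark.ml locations used throughout these examples):

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1, 1.0]).build()
predictions, model = TVS(lr, grid, dataTrain, dataTest)
predictions.select("label", "prediction").show(5)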
Example 2
def process(spark, train_data, test_data):
    # train_data - path to the file with the model-training data
    # test_data  - path to the file with the model-evaluation data
    # (only train_data is used at the moment)
    # run with: python PySparkMLFit.py train.parquet validate.parquet

    # load train_data
    train_data = spark.read.parquet(train_data)

    # hold out a test split before any fitting, so the evaluation
    # below is done on data the model has not seen
    (training_data1, test_data) = train_data.randomSplit([0.8, 0.2], seed=42)

    # assemble the feature vector
    feature = VectorAssembler(inputCols=train_data.columns[:7],
                              outputCol="features")

    # Train a GBT model.
    gbt = GBTRegressor(labelCol="ctr", featuresCol="features", maxIter=10)

    # pipeline
    pipeline = Pipeline(stages=[feature, gbt])

    paramGrid = ParamGridBuilder().addGrid(
        gbt.maxDepth,
        [2, 3, 4, 5, 6, 7, 8, 9]).addGrid(gbt.maxBins,
                                          [10, 16, 20, 24, 32, 36]).build()

    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=RegressionEvaluator(labelCol="ctr",
                                      predictionCol="prediction",
                                      metricName="rmse"),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(training_data1)

    # compute RMSE on the held-out split of train_data and print it
    prediction = model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="ctr",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    # save the best model
    model.bestModel.write().overwrite().save("model")
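The saved best model can be loaded back later as a PipelineModel; a minimal sketch, assuming the same schema as the training parquet (validate.parquet is the evaluation file mentioned in the run command above):

from pyspark.ml import PipelineModel

best_model = PipelineModel.load("model")
new_data = spark.read.parquet("validate.parquet")
best_model.transform(new_data).select("ctr", "prediction").show(5)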
Example 3
def model_training(training_data, param_info):
    # Get the param grid and the model from model_setting
    param_grid, rf = model_setting(param_info)
    # Build the evaluator; note it scores the 'prediction' column
    # rather than the default 'rawPrediction'
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
    # Set up the train/validation split for the hyperparameter search
    tvs = TrainValidationSplit(estimator=rf,
                               estimatorParamMaps=param_grid,
                               evaluator=evaluator,
                               trainRatio=0.8)
    # Fit the model
    model = tvs.fit(dataset=training_data)
    # Return the best model
    return model.bestModel
def get_validation(by='cv'):
    if by == 'cv':
        return CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=10)
    elif by == 'tvs':
        return TrainValidationSplit(estimator=pipeline,
                                    estimatorParamMaps=paramGrid,
                                    evaluator=evaluator,
                                    trainRatio=0.8)
    else:
        print("please choose one of 'cv' or 'tvs'")
        return None
def kNN_with_k_fixed(df, k):
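    # KNNClassifier is a third-party estimator (e.g. from the spark-knn
    # package); it is not part of pyspark.ml.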
    knn = KNNClassifier(featuresCol='features',
                        labelCol='label',
                        topTreeSize=1000,
                        topTreeLeafSize=10,
                        subTreeLeafSize=30)
    grid = ParamGridBuilder().addGrid(knn.k, [k]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=knn,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('kNN:k', k, result)
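Since kNN_with_k_fixed only prints its metric, the natural use is a sweep over candidate k values; a sketch, assuming df is a prepared DataFrame with 'features' and 'label' columns:

for k in [1, 3, 5, 11, 21]:
    kNN_with_k_fixed(df, k)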
Example 6
    def _run_test_save_load_trained_model(self, LogisticRegressionCls,
                                          LogisticRegressionModelCls):
        # This tests saving and loading both the best trained model and the
        # TrainValidationSplitModel itself.
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )
        lr = LogisticRegressionCls()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(
            estimator=lr,
            estimatorParamMaps=grid,
            evaluator=evaluator,
            collectSubModels=True,
            seed=42,
        )
        tvsModel = tvs.fit(dataset)
        lrModel = tvsModel.bestModel

        lrModelPath = temp_path + "/lrModel"
        lrModel.save(lrModelPath)
        loadedLrModel = LogisticRegressionModelCls.load(lrModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedTvsModel = TrainValidationSplitModel.load(tvsModelPath)
        for param in [
                lambda x: x.getSeed(),
                lambda x: x.getTrainRatio(),
        ]:
            self.assertEqual(param(tvsModel), param(loadedTvsModel))

        self.assertTrue(
            all(
                loadedTvsModel.isSet(param)
                for param in loadedTvsModel.params))
Example 7
def main(name='Loan_model'):
    logger = logging.getLogger(__name__)
    spark = SparkSession.builder.appName(f'{name}').getOrCreate()

    data = spark.read.csv(path, inferSchema=True, header=True)

    logger.info('Vectorising features')
    data = get_features(data, spark, target)

    logger.info('Obtaining weight balance')
    data = data.withColumn('weights', weight_balance(data, col('label')))

    logger.info('Creating 80-20 train/test split')
    train, test = data.randomSplit([.8, .2], seed=1234)

    logger.info('Training and optimising model')

    lr = LogisticRegression(
        featuresCol=data.columns[0],
        labelCol=data.columns[1],
        weightCol=data.columns[2],
        maxIter=100,
    )

    pipeline = Pipeline(stages=[lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1]) \
        .addGrid(lr.elasticNetParam, [0.001, 0.01, 0.1, 1]) \
        .build()

    model_tune = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
        trainRatio=0.8)

    model = model_tune.fit(train)

    metrics = evaluate_model(model, test, spark)

    model.bestModel.write().overwrite().save(output_path)
    metrics.toPandas().to_csv(f'{output_path}testset_metrics.csv')

    logger.info(f'Model and metrics exported to {output_path}')

    return model, metrics
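main() relies on module-level names (path, target, output_path) and helpers (get_features, weight_balance, evaluate_model) defined outside this snippet. weight_balance is presumably a class-weighting helper; a hypothetical sketch of one common implementation, where each class is weighted by the other class's frequency so both contribute equally to the weighted loss:

from pyspark.sql import functions as F

def weight_balance(data, label_col):
    # fraction of positive rows in the data
    ratio = data.filter(label_col == 1).count() / data.count()
    # positives get the negative-class ratio as weight, and vice versa
    return F.when(label_col == 1, 1.0 - ratio).otherwise(ratio)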
def RForest_with_maxFeatures_maxDepth_fixed(df, max_depth, max_features):
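    # Despite its name, this helper trains a single DecisionTreeClassifier,
    # and the max_features argument is applied to the tree's maxBins parameter.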
    RForest = DecisionTreeClassifier(featuresCol='features',
                                     labelCol='label',
                                     impurity='gini',
                                     maxMemoryInMB=1024)
    grid = ParamGridBuilder().addGrid(RForest.maxDepth, [max_depth]).addGrid(
        RForest.maxBins, [max_features]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                              labelCol='label')
    tts = TrainValidationSplit(estimator=RForest,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.6666)
    ttsModel = tts.fit(df)
    result = evaluator.evaluate(ttsModel.transform(df))
    print('RForest:maxDepth', max_depth, ':maxBins', max_features, ':result',
          result)
Example 9
def validate(estimator, train, grid):
    """ Elige los hiperparámetros de "estimator" a partir de "grid" y utilizando el 
    20% de los datos de "train" como partición de validación. Como métrica de 
    comparación, utiliza AUC.
    """
    tvs = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=grid,
        evaluator=BinaryClassificationEvaluator(labelCol="class"),
        trainRatio=0.8,
        seed=89)

    model = tvs.fit(train)
    for i, item in enumerate(model.getEstimatorParamMaps()):
        params = ["%s: %s" % (p.name, str(v)) for p, v in item.items()]
        print(params,
              model.getEvaluator().getMetricName(), model.validationMetrics[i])
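A hypothetical call, using a decision tree as the estimator (the evaluator above expects the label column to be named "class", and train is assumed to carry a numeric "class" label plus a "features" column):

from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol="features", labelCol="class")
grid = ParamGridBuilder().addGrid(dt.maxDepth, [3, 5, 7]).build()
validate(dt, train, grid)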
Example 10
 def test_parallel_evaluation(self):
     dataset = self.spark.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
     evaluator = BinaryClassificationEvaluator()
     tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     tvs.setParallelism(1)
     tvsSerialModel = tvs.fit(dataset)
     tvs.setParallelism(2)
     tvsParallelModel = tvs.fit(dataset)
     self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
Example 11
def train(data: DataFrame, esti: Estimator, eid: str,
          param_grid_builder: Callable[[Estimator], list]) -> PipResult:
    try:
        print(f"--- train {eid}")
        # Prepare training and test data.
        df_train, df_test = data.randomSplit([0.9, 0.1], seed=12345)

        # We use a ParamGridBuilder to construct a grid of parameters to search over.
        # TrainValidationSplit will try all combinations of values and determine best model using
        # the evaluator.
        params = param_grid_builder(esti)
        print(f"--- params")
        pprint(params)

        # In this case the estimator is simply the linear regression.
        # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
        tvs = TrainValidationSplit(
            estimator=esti,
            estimatorParamMaps=params,
            evaluator=RegressionEvaluator(),
            # 80% of the data will be used for training, 20% for validation.
            trainRatio=0.8)

        # Run TrainValidationSplit, and choose the best set of parameters.
        trained_models: TrainValidationSplitModel = tvs.fit(df_train)

        # Make predictions on test data. model is the model with combination of parameters
        # that performed best.
        predictions = trained_models.transform(df_test) \
            .select("features", "label", "prediction")

        # Select (prediction, true label) and compute test error
        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)

        print(f"-- Root Mean Squared Error (RMSE) on test data = {rmse}")
        fnam = cm.fnam(eid)
        hlp.save_model(trained_models.bestModel, hlp.get_datadir(), fnam)
        print(f"-- saved model to {fnam}")
        return PipResult(rmse, trained_models.bestModel, "OK")
    except Exception:
        print(tb.format_exc())
        return PipResult(0.0, None, "ERROR")
Example 12
def func2():
    """
    Pipeline-based machine learning
    :return:
    """
    row_df = sqlContext.read.format("csv").option("header", True).option("delimiter", "\t").load(Path + "train.tsv")
    df = row_df.select(["url", "alchemy_category"]  # columns that need no conversion
                       + [replace_question(col(column)).cast("double").alias(column) for column in
                          row_df.columns[4:]])  # columns that do need conversion
    train_df, test_df = df.randomSplit([0.7, 0.3])
    ### Build the machine-learning Pipeline
    stringIndexer = StringIndexer(inputCol="alchemy_category", outputCol="alchemy_category_index")  # indexer that encodes the category strings
    encoder = OneHotEncoder(dropLast=False, inputCol="alchemy_category_index", outputCol="alchemy_category_indexVec")
    assemblerInputs = ["alchemy_category_indexVec"] + row_df.columns[4:-1]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, dt])
    print(pipeline.getStages())

    ### Process the data and train with the Pipeline
    pipelineModel = pipeline.fit(train_df)  # train
    print(pipelineModel.stages[3])  # stage index 3 (the classifier) yields the model; inspect it here
    print(pipelineModel.stages[3].toDebugString)

    ### Predict with the pipeline
    predicted = pipelineModel.transform(test_df)
    print(predicted.columns)
    predicted.select("url", "features", "rawPrediction", "probability", "label", "prediction").show(5)
    predicted.select("probability", "prediction").take(5)

    ### Evaluate model accuracy
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
    auc = evaluator.evaluate(predicted)
    print("auc:", auc)

    # Parameters to sweep, looking for the best combination
    paramGrid = ParamGridBuilder().addGrid(dt.impurity, ["gini", "entropy"]).addGrid(dt.maxDepth, [5, 10, 15]).addGrid(dt.maxBins, [10, 15, 20]).build()
    tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8)  # trainRatio=0.8 splits the data 8:2 into training and validation sets
    tvs_pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, tvs])
    tvs_pipelineModel = tvs_pipeline.fit(train_df)
    bestModel = tvs_pipelineModel.stages[3].bestModel
    print("bestModel", bestModel)
    predictions = tvs_pipelineModel.transform(test_df)
    auc2 = evaluator.evaluate(predictions)
    print("auc2:", auc2)
Example 13
    def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
        # This tests saving and loading a TrainValidationSplit whose param
        # grid contains a nested estimator (OneVsRest over LogisticRegression).
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )
        ova = OneVsRest(classifier=LogisticRegressionCls())
        lr1 = LogisticRegressionCls().setMaxIter(100)
        lr2 = LogisticRegressionCls().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        tvs = TrainValidationSplit(estimator=ova,
                                   estimatorParamMaps=grid,
                                   evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), grid)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

        originalParamMap = tvs.getEstimatorParamMaps()
        loadedParamMap = loadedTvs.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
Example 14
def example_train_cluster(df):
    # Expected input: inventory
    vec = VectorAssembler(inputCols=["price", "size", "lat", "lng"],
                          outputCol="v")
    kmeans = KMeans(featuresCol="v", predictionCol="pred")

    pipe = Pipeline(stages=[vec, kmeans])
    ev = ClusteringEvaluator(
        predictionCol="pred", featuresCol="v",
        distanceMeasure="cosine")  # cosine, squaredEuclidean
    grid = ParamGridBuilder().addGrid(kmeans.k, [3, 4, 5]).build()
    tvs = TrainValidationSplit(estimator=pipe,
                               estimatorParamMaps=grid,
                               evaluator=ev,
                               trainRatio=0.75)
    model = tvs.fit(df)

    return model
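Because clustering has no held-out labels, the chosen k is simply the one with the best validation score (ClusteringEvaluator defaults to silhouette). The per-candidate scores can be read off the fitted model; a sketch, using an assumed inventory DataFrame:

model = example_train_cluster(inventory_df)  # hypothetical DataFrame with price, size, lat, lng
for params, metric in zip(model.getEstimatorParamMaps(), model.validationMetrics):
    print({p.name: v for p, v in params.items()}, "score:", metric)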
Example 15
def tune_model(estimator, param_grid, evaluator, train_data):
    tvs = TrainValidationSplit(estimator=estimator,
                               estimatorParamMaps=param_grid,
                               evaluator=evaluator,
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8,
                               seed=16)
    
    model = tvs.fit(train_data)
    
    # print results for each combination
    for i, item in enumerate(model.getEstimatorParamMaps()):
        grid = ["%s: %s" % (p.name, str(v)) for p, v in item.items()]
        print(grid, model.getEvaluator().getMetricName(),
              model.validationMetrics[i])


    return model.bestModel
def trainAndEvalModelByRandomForestClassifierAndTrainValidationSplit(stages, train_df, test_df, evaluator):
    '''
    Train and validate a model with a RandomForestClassifier and TrainValidationSplit, and find the best model
    :param stages:
    :param train_df:
    :param test_df:
    :param evaluator:
    :return:
    '''
    rf = RandomForestClassifier(labelCol='label', featuresCol='features', numTrees=10)
    paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['gini', 'entropy']).addGrid(rf.maxDepth, [5, 10, 15]).addGrid(rf.maxBins, [10, 15, 20]).addGrid(rf.numTrees, [10, 20, 30]).build()
    rftvs = TrainValidationSplit(estimator=rf, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8)
    rftvsPipeline = Pipeline(stages=stages+[rftvs])
    rftvsPipelineModel = rftvsPipeline.fit(train_df)
    bestModel = rftvsPipelineModel.stages[3].bestModel
    predictions = rftvsPipelineModel.transform(test_df)
    auc = evaluator.evaluate(predictions)
    return (bestModel, predictions, auc)
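The stages[3] lookup assumes exactly three preprocessing stages precede the TrainValidationSplit; a hypothetical construction matching that layout (column names are placeholders, and train_df/test_df are assumed prepared DataFrames). Using stages[-1] instead of stages[3] would make the lookup independent of the number of preprocessing stages.

stringIndexer = StringIndexer(inputCol="category", outputCol="category_index")
encoder = OneHotEncoder(dropLast=False, inputCol="category_index", outputCol="category_vec")
assembler = VectorAssembler(inputCols=["category_vec", "numeric_feature"], outputCol="features")
bestModel, predictions, auc = trainAndEvalModelByRandomForestClassifierAndTrainValidationSplit(
    [stringIndexer, encoder, assembler], train_df, test_df,
    BinaryClassificationEvaluator(labelCol="label"))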
Example 17
def random_forest_classifier(training_data, testing_data):
    assembler = VectorAssembler(inputCols=[
        "col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8", "col9",
        "col10", "col11", "col12", "col13", "col14", "col15", "col16", "col17",
        "col19", "col20", "col21", "col22", "col23", "col24", "col25"
    ],
                                outputCol="features")
    training_data_vector = assembler.transform(training_data)
    training_data_vector = training_data_vector.select('index', 'features',
                                                       'label')

    label_indexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel").fit(training_data_vector)
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=15).fit(training_data_vector)

    rf = RandomForestClassifier(labelCol="indexedLabel",
                                # consume the indexed features produced by feature_indexer,
                                # so the tuned maxCategories actually affects the forest
                                featuresCol="indexedFeatures",
                                numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])

    param_grid = ParamGridBuilder() \
        .addGrid(feature_indexer.maxCategories, [5, 15, 25]) \
        .addGrid(rf.numTrees, [10, 50, 100]) \
        .addGrid(rf.maxDepth, [5, 10]) \
        .build()

    tvs = TrainValidationSplit(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=RegressionEvaluator(),
                               trainRatio=0.8)

    model = tvs.fit(training_data_vector)

    testing_data_vector = assembler.transform(testing_data)

    predictions = model.transform(
        testing_data_vector.select('index', 'features', 'bid', 'target'))

    selected = predictions.select("index", 'bid', 'target', "probability",
                                  "prediction")

    __output('RandomForestClassifier', selected)
Example 18
def predictDataForStation(stationData, columnName, station_id):
    columnsList = ["max_temp", "med_temp", "min_temp", "max_pressure", "min_pressure", "precip", "insolation"]
    # assembler = VectorAssembler(inputCols=columnsList,outputCol="features")
    assembler = VectorAssembler(inputCols=[columnName], outputCol="features")
    assembledData = assembler.transform(stationData)

    feature_data = assembledData.withColumn("label", stationData[columnName])

    print("Getting training data...")
    test_data = feature_data.sample(False, 0.1)
    train_data = feature_data.sample(False, 0.9)
    print("Test data: " + str(test_data.count()) + " , Train data: " + str(train_data.count()))

    # BestModel
    lr = LinearRegression()

    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01, 0.001, 0.0001]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.maxIter, [1, 10, 50, 100]) \
        .build()

    try:
        print("Calculating and training the best model")
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(),
                                   trainRatio=0.8)
        # Fit the model
        lrModel = tvs.fit(train_data)
        saveModel(lrModel.bestModel, station_id, columnName)


    ##### THESE LINES ARE THE GOOD ONES!!!! #####
    # predictions = lrModel.transform(test_data).select("measure_date","station_id",columnName,"prediction")
    # groupedPredictions = predictions.groupBy("station_id").agg(avg(columnName),avg("prediction"))
    # insertDataIntoDatabase(groupedPredictions,columnName,station_id)

    except IllegalArgumentException as error:
        print("#####IllegalArgumentException on :\t " + str(station_id) + " on " + str(columnName) + "#####")
        print("IllegalArgumentException : {0}".format(error))
    except py4j.protocol.Py4JJavaError as error:
        print("#####Py4JJavaError on :\t " + str(station_id) + " on " + str(columnName) + "#####")
        print("Py4JJavaError : {0}".format(error))
Example 19
def lr_train_tvs(data):
    #Logistic Regression using Count Vector Features
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel=label_stringIdx.fit(data)
    data=lsmodel.transform(data)
    #(trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="cfeatures", vocabSize=10000, minDF=5)
    '''hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",minDocFreq=5)'''
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    lr = LogisticRegression(regParam=0.3, elasticNetParam=0,featuresCol=countVectors.getOutputCol(), labelCol="label")
    pipeline = Pipeline(stages=[countVectors,lr])
    grid = ParamGridBuilder().addGrid(lr.maxIter, [10,15,20]).build()
    tvs = TrainValidationSplit(estimator=pipeline,
                               estimatorParamMaps=grid,
                               evaluator=evaluator,
                               trainRatio=0.9)
    tvsmodel = tvs.fit(data)
    # note: the metric below is computed on the same data used for fitting
    return (evaluator.evaluate(tvsmodel.transform(data)), lsmodel.labels, tvsmodel)
def trainAndEvalModelByDecisionTreeClassifierAndTrainValidationSplit(stages, train_df, test_df, evaluator):
    '''
    Train and validate a model with a DecisionTreeClassifier and TrainValidationSplit, and find the best model
    :param stages:
    :param train_df:
    :param test_df:
    :param evaluator:
    :return:
    '''
    dt = DecisionTreeClassifier(labelCol='label', featuresCol='features', impurity='gini', maxDepth=10, maxBins=14)
    paramGrid = ParamGridBuilder().addGrid(dt.impurity, ['gini', 'entropy']).addGrid(dt.maxDepth, [5, 10, 15]).addGrid(dt.maxBins, [10, 15, 20]).build()  # trains 2*3*3 = 18 parameter combinations
    tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8)  # trainRatio=0.8 splits the data 8:2 into training and validation sets before tuning
    tvsPipline = Pipeline(stages=stages+[tvs])  # build the model-training Pipeline
    tvsPiplineModel = tvsPipline.fit(train_df)  # produce the trained model
    bestModel = tvsPiplineModel.stages[3].bestModel
    # print('========== [trainAndEvalModelByTrainValidationSplit] >>>> best decision tree rules after training:')
    # print(bestModel.toDebugString[:500])  # show only the first 500 characters
    predictions = tvsPiplineModel.transform(test_df)
    auc = evaluator.evaluate(predictions)
    return (bestModel, predictions, auc)
Example 21
    def gridCV(self):
        param_grid = ParamGridBuilder() \
            .addGrid(self.factor_model.rank, [5, 10, 15, 20]) \
            .addGrid(self.factor_model.maxIter, [10, 15, 20, 30, 35]) \
            .addGrid(self.factor_model.regParam, [0.05, 0.1, 0.15, 0.2]) \
            .build()

        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="rating",
                                        metricName="rmse")

        grid_model = TrainValidationSplit(estimator=self.factor_model,
                                          estimatorParamMaps=param_grid,
                                          evaluator=evaluator)

        model = grid_model.fit(self.train)
        best_model = model.bestModel
        print('rank', best_model.rank)
        print('max iter', best_model._java_obj.parent().getMaxIter())
        print('reg param', best_model._java_obj.parent().getRegParam())
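The rank/maxIter/regParam grid implies that self.factor_model is an ALS recommender. A hypothetical setup for the surrounding class (column names are placeholders; coldStartStrategy="drop" keeps NaN predictions out of the RMSE computation):

from pyspark.ml.recommendation import ALS

# e.g. inside the class __init__:
self.factor_model = ALS(userCol="userId", itemCol="itemId", ratingCol="rating",
                        coldStartStrategy="drop")
self.train = ratings_df  # DataFrame of (userId, itemId, rating) rows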
Example 22
def tuning(classifier, paramGrid, train):
    tvs = TrainValidationSplit(
        estimator=classifier,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train)

    ParamMaps = model.getEstimatorParamMaps()
    for i, params in enumerate(ParamMaps):
        print("---------_", str(i), "_---------", " AUC: ",
              str(model.validationMetrics[i]))
        for param, value in params.items():
            print(param.name, ": ", str(value), "; ", end='')
        print("\n")

    return model.bestModel
def test_meta_estimator_disable_post_training_autologging(dataset_regression):
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    eval_dataset = dataset_regression.sample(fraction=0.3, seed=1)
    lrParamMaps = [
        {
            lr.maxIter: 1,
            lr.standardization: False
        },
        {
            lr.maxIter: 200,
            lr.standardization: True
        },
        {
            lr.maxIter: 2,
            lr.standardization: False
        },
    ]
    eva = RegressionEvaluator(metricName="rmse")
    estimator = TrainValidationSplit(estimator=lr,
                                     estimatorParamMaps=lrParamMaps,
                                     evaluator=eva)

    with mock.patch(
            "mlflow.pyspark.ml._AutologgingMetricsManager.register_model"
    ) as mock_register_model, mock.patch(
            "mlflow.sklearn._AutologgingMetricsManager.is_metric_value_loggable"
    ) as mock_is_metric_value_loggable, mock.patch(
            "mlflow.pyspark.ml._AutologgingMetricsManager.log_post_training_metric"
    ) as mock_log_post_training_metric, mock.patch(
            "mlflow.pyspark.ml._AutologgingMetricsManager.register_prediction_input_dataset"
    ) as mock_register_prediction_input_dataset:
        with mlflow.start_run():
            model = estimator.fit(dataset_regression)

        model.transform(eval_dataset)

        mock_register_model.assert_called_once()
        mock_is_metric_value_loggable.assert_not_called()
        mock_register_prediction_input_dataset.assert_not_called()
        mock_log_post_training_metric.assert_not_called()
Example 24
def testCvWithLr():
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    train, test = df.randomSplit([0.9, 0.1], seed=12345)

    lr = TFNeuralNetwork()
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.lr, [0.1, 0.01]) \
        .addGrid(lr.maxIter, [10]) \
        .build()

    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    model = tvs.fit(train)
    pred = model.transform(test)
    pred.show()
Example 25
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [
                (Vectors.dense([0.0]), 0.0),
                (Vectors.dense([0.4]), 1.0),
                (Vectors.dense([0.5]), 0.0),
                (Vectors.dense([0.6]), 1.0),
                (Vectors.dense([1.0]), 1.0),
            ] * 10,
            ["features", "label"],
        )
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr,
                                   estimatorParamMaps=grid,
                                   evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        self.assertEqual(len(tvsModel.subModels), len(grid))

        # Test that the default value of the "persistSubModels" option is "true"
        testSubPath = temp_path + "/testTrainValidationSplitSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        tvsModel.save(savingPathWithSubModels)
        tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
        self.assertEqual(len(tvsModel3.subModels), len(grid))
        tvsModel4 = tvsModel3.copy()
        self.assertEqual(len(tvsModel4.subModels), len(grid))

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        tvsModel.write().option("persistSubModels",
                                "false").save(savingPathWithoutSubModels)
        tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
        self.assertEqual(tvsModel2.subModels, None)

        for i in range(len(grid)):
            self.assertEqual(tvsModel.subModels[i].uid,
                             tvsModel3.subModels[i].uid)
Example 26
def make_weather_trainers(trainRatio, estimator_gridbuilders, metricName=None):
    """Construct a list of TrainValidationSplit estimators for weather data
       where `estimator_gridbuilders` is a list of (Estimator, ParamGridBuilder) tuples
       and 0 < `trainRatio` <= 1 determines the fraction of rows used for training.
       The RegressionEvaluator will use a non-default `metricName`, if specified.
    """
    feature_cols = ['latitude', 'longitude', 'elevation', 'doy']
    column_names = dict(featuresCol="features",
                        labelCol="tmax",
                        predictionCol="tmax_pred")

    query = "SELECT station,date, dayofyear(date) as doy, latitude, longitude, elevation,tmax  FROM __THIS__"

    getDOY = SQLTransformer(
        statement=query
    )  # engineers a day-of-year feature 'doy' via the query above

    feature_assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol=column_names["featuresCol"])
    ev = (RegressionEvaluator().setLabelCol(
        column_names["labelCol"]).setPredictionCol(
            column_names["predictionCol"]))
    if metricName:
        ev = ev.setMetricName(metricName)
    tvs_list = []
    for est, pgb in estimator_gridbuilders:
        est = est.setParams(**column_names)

        pl = Pipeline(
            stages=[getDOY, feature_assembler,
                    est])  # pipeline: doy feature -> assembler -> estimator

        paramGrid = pgb.build()
        tvs_list.append(
            TrainValidationSplit(estimator=pl,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=ev,
                                 trainRatio=trainRatio))
    return tvs_list
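A hypothetical call, pairing one estimator with its grid builder (LinearRegression and the regParam values are only an illustration; setParams inside the function rewires the estimator's column names):

from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder

lr = LinearRegression()
estimator_gridbuilders = [
    (lr, ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1])),
]
tvs_list = make_weather_trainers(trainRatio=0.8,
                                 estimator_gridbuilders=estimator_gridbuilders,
                                 metricName="r2")
models = [tvs.fit(weather_df) for tvs in tvs_list]  # weather_df: station, date, latitude, longitude, elevation, tmax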
Example 27
 def train(self):
     print('Building stages...')
     stages = []
     if not isinstance(self.featurestages, list):
         self.featurestages = [self.featurestages]
     stages += self.featurestages
     
     
     #In case there is word2vec which has negative features, scale the features
     #to nonnegative values because naive bayes requires that
     if(('Word2Vec' in str(stages)) and ('NaiveBayes' in str(self.classifier))):
         print('Word2Vec and NaiveBayes detected, scaling to nonnegative [0.0,1.0]')
         stages[-1].setOutputCol('prefeatures')
         scaler  = MinMaxScaler(inputCol='prefeatures', outputCol='features')
         stages = stages + [scaler]
     
     
     stages += [self.classifier]
     self.pipeline = Pipeline(stages = stages)
     
     print('Using the following stages: ' + str(self.pipeline.getStages()))
     print('Training model...')
     if self.classifiergrid is None:
         print('Training without a Parameter Grid...')
         dftrain, dftest = self.DF.randomSplit([0.80, 0.20])
         model = self.pipeline.fit(dftrain)
         self.predictions = model.transform(dftest)
         self.model=model
     else:
         print('Training with a Parameter Grid...')
         tvs = TrainValidationSplit(estimator=self.pipeline,
                                    estimatorParamMaps=self.classifiergrid,
                                    evaluator=BinaryClassificationEvaluator(),
                                    parallelism=4,
                                    trainRatio=0.8)
         dftrain, dftest = self.DF.randomSplit([0.80, 0.20])
         model = tvs.fit(dftrain)
         self.predictions = model.transform(dftest)
         self.model=model
Example 28
def GBT_CV(trainingData, testData):
    """
    Gradient Boosted Tree Regression Model Selection
    :param trainingData:
    :param testData:
    :return: Trained model, predictions
    """
    gbt = GBTRegressor(seed=42)
    paramGrid = ParamGridBuilder()\
        .addGrid(gbt.maxIter, [50, 100, 200, 300, 400, 500 ]) \
        .addGrid(gbt.maxDepth, [2, 6, 10, 14])\
        .build()

    tvs = TrainValidationSplit(
        estimator=gbt,
        estimatorParamMaps=paramGrid,
        evaluator=RegressionEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)
    model = tvs.fit(trainingData)
    predictions = model.transform(testData)
    return model, predictions
Example 29
    def get_feature_importances(self, sdf):
        """

        :param sdf:
        :return:
        """

        evaluator = self.evaluator

        if evaluator is None:
            raise NotImplementedError("The evaluator parameter is not set.")

        space_grid = self.get_space_grid()

        model = self.model

        opt = self.opt

        if opt:
            tvs = TrainValidationSplit(estimator=model,
                                       estimatorParamMaps=space_grid,
                                       evaluator=evaluator,
                                       trainRatio=0.8)

            tvsModel = tvs.fit(sdf)

            # the check must be on bestModel: tvs.fit always returns a
            # TrainValidationSplitModel, never a PipelineModel
            if isinstance(tvsModel.bestModel, PipelineModel):
                return tvsModel.bestModel.stages[-1].featureImportances.toArray()
            else:
                return tvsModel.bestModel.featureImportances.toArray()
        else:
            fitted_model = model.fit(sdf)

            if isinstance(fitted_model, PipelineModel):
                return fitted_model.stages[-1].featureImportances.toArray()
            else:
                return fitted_model.featureImportances.toArray()
Example 30
def build_model(training):
    training.cache()
    columns = training.columns
    columns.remove("Occupancy")

    assembler = VectorAssembler(inputCols=columns, outputCol="featureVec")
    lr = LogisticRegression(featuresCol="featureVec", labelCol="Occupancy")

    pipeline = Pipeline(stages=[assembler, lr])

    param_grid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 1.0]) \
        .build()

    evaluator = BinaryClassificationEvaluator(labelCol="Occupancy")

    validator = TrainValidationSplit(estimator=pipeline,
                                     estimatorParamMaps=param_grid,
                                     evaluator=evaluator,
                                     trainRatio=0.9)
    validator_model = validator.fit(training)

    return validator_model.bestModel
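A usage sketch (the file path and schema are hypothetical; the CSV is assumed to carry an "Occupancy" label plus numeric feature columns):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("occupancy").getOrCreate()
training = spark.read.csv("occupancy.csv", header=True, inferSchema=True)
best_pipeline = build_model(training)
print(best_pipeline.stages[-1].coefficients)  # the tuned LogisticRegression stage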