Example #1
def gbtClassifier(train, test):
    gbt = GBTClassifier(maxIter=10)
    gbtModel = gbt.fit(train)
    predictions = gbtModel.transform(test)
    predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction',
                       'probability').show(10)
    return predictions
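The train and test arguments are assumed to already carry a 'features' vector, a numeric 'label', and the 'age'/'job' passthrough columns selected above; a minimal hypothetical driver (the file name and feature columns are assumptions):

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()
bank = spark.read.csv('bank.csv', header=True, inferSchema=True)  # hypothetical input file
bank = VectorAssembler(inputCols=['age', 'balance', 'duration'],
                       outputCol='features').transform(bank)
train, test = bank.randomSplit([0.7, 0.3], seed=42)
predictions = gbtClassifier(train, test)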
Example #2
def basic_example(spark, resources_folder):
    data = spark.read.format('libsvm').load(resources_folder +
                                            'sample_libsvm_data.txt')
    data.printSchema()
    data.show()
    train_data, test_data = data.randomSplit([0.6, 0.4])
    dtc = DecisionTreeClassifier()
    rfc = RandomForestClassifier()
    gbtc = GBTClassifier()

    dtc_model = dtc.fit(train_data)
    rfc_model = rfc.fit(train_data)
    gbtc_model = gbtc.fit(train_data)

    dtc_predictions = dtc_model.transform(test_data)
    rfc_predictions = rfc_model.transform(test_data)
    gbtc_predictions = gbtc_model.transform(test_data)

    dtc_predictions.show()
    rfc_predictions.show()
    # GBT has no rawPrediction column; if you are building a binary or
    # multiclass classification evaluator, it may ask for rawPrediction as an input
    gbtc_predictions.show()

    acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
    print("DTC Accuracy")
    print(acc_eval.evaluate(dtc_predictions))
    print("RFC Accuracy")
    print(acc_eval.evaluate(rfc_predictions))
    print("GBTC Accuracy")
    print(acc_eval.evaluate(gbtc_predictions))

    print(rfc_model.featureImportances)
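The caveat above applied to older Spark releases; from roughly Spark 2.2 onward GBTClassificationModel also emits rawPrediction and probability, so an AUC check like the following should work on the predictions computed above (a sketch, not part of the original example):

from pyspark.ml.evaluation import BinaryClassificationEvaluator

auc_eval = BinaryClassificationEvaluator(metricName='areaUnderROC')
print("GBTC AUC")
print(auc_eval.evaluate(gbtc_predictions))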
Example #3
 def test_gbt_classifier(self):
     raw_data = self.spark.createDataFrame([
         (1.0, Vectors.dense(1.0)),
         (0.0, Vectors.sparse(1, [], []))
     ], ["label", "features"])
     string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
     si_model = string_indexer.fit(raw_data)
     data = si_model.transform(raw_data)
     gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(model, 'Sparkml GBT Classifier', [
         ('features', FloatTensorType([1, feature_count]))
     ], spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     ]
     paths = save_data_models(data_np, expected, model, model_onnx,
                              basename="SparkmlGBTClassifier")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
     compare_results(expected, output, decimal=5)
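run_onnx_model and save_data_models are test helpers whose internals aren't shown; a rough equivalent of the scoring step with plain onnxruntime (an assumption, not the helper's actual code) would be:

import onnxruntime as rt

sess = rt.InferenceSession(onnx_model_path)
# the declared input shape is [1, feature_count], so score one row at a time
pred, prob = sess.run(['prediction', 'probability'], {'features': data_np[:1]})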
Example #4
    def exec_gradient_boost(self,
                            featuresCol1="features",
                            labelCol1="label",
                            predictionCol1="prediction",
                            maxIter1=30,
                            numClass1=2):
        '''
        Creates the Gradient Boost model pipeline; this model is only applicable to binary classification problems.
        Input: featuresCol1: feature column name, labelCol1: label column name, predictionCol1: prediction column name,
               maxIter1: maximum number of iterations, numClass1: number of class labels (restricted to 2)
        Output: None
        '''
        #Initialize GradientBoost Model with parameters passed
        gb = GBTClassifier(featuresCol=featuresCol1,
                           labelCol=labelCol1,
                           predictionCol=predictionCol1,
                           maxIter=maxIter1)

        #Fit gradient boost model with training data
        gbModel = gb.fit(self.trainingData)

        #Make gradient boost model predictions on testData
        predictions = gbModel.transform(self.testData)

        #Evaluate the results generated by the model prediction
        self.model_evaluator(predictions,
                             modelType="GradientBoost Model",
                             modelParams=str({'maxIter': maxIter1}),
                             numClass=numClass1)
Example #5
def gradient_boosted_tree_classifier(training_data, test_data, validation_data):
    # ROC: 0.71
    gbt = GBTClassifier(featuresCol='scaled_features', labelCol='label', maxIter=10)
    gbtModel = gbt.fit(training_data)

    predict_valid = gbtModel.transform(validation_data)
    # predict_train = gbtModel.transform(training_data)
    predict_valid.show(5)

    evaluate_metrics(predict_valid)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='label',
                                              metricName="areaUnderROC")

    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_valid,
                    data_type="valid_data")

    paramGrid = (ParamGridBuilder()
                 .addGrid(gbt.maxDepth, [2, 4, 6])
                 .addGrid(gbt.maxBins, [20, 60])
                 .addGrid(gbt.maxIter, [10, 20])
                 .build())

    cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
    # Run cross validations.
    cvModel = cv.fit(training_data)
    predict_cross_valid = cvModel.transform(validation_data)

    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_cross_valid,
                    data_type="valid_data")

    predict_final = cvModel.bestModel.transform(test_data)

    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_final,
                    data_type="test_data")
Example #6
def evaluateGradientBoostTree(trainDF, testDF):
    for stepsize in [0.01, 0.1, 1]:
        classifier = GBTClassifier(stepSize=stepsize)
        model = classifier.fit(trainDF)
        predictions = model.transform(testDF)
        print("Gradient Boost Tree with stepsize : {}".format(stepsize))
        printevaluatation(model, predictions)
Example #7
    def gbt(df, columns, input_col, **kwargs):
        """
        Runs a gradient boosting tree classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :return: Tuple of (DataFrame with predictions, fitted GBT model).
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats, output_col="features")

        model = GBTClassifier(**kwargs)

        df = df.cols.rename(name_col(input_col, "index_to_string"), "label")

        gbt_model = model.fit(df)
        df_model = gbt_model.transform(df)
        return df_model, gbt_model
Example #8
def gbt(df, columns, input_col):
    """
    Runs a gradient boosting tree classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: Tuple of (DataFrame with predictions, fitted GBT model).
    """

    assert_spark_df(df)

    assert isinstance(columns, list), "Error, columns must be a list"

    assert isinstance(input_col, str), "Error, input column must be a string"

    data = df.select(columns)
    feats = data.columns
    feats.remove(input_col)
    transformer = op.DataFrameTransformer(data)
    transformer.string_to_index(input_cols=input_col)
    transformer.vector_assembler(input_cols=feats)
    model = GBTClassifier()
    transformer.rename_col(columns=[(input_col + "_index", "label")])
    gbt_model = model.fit(transformer.df)
    df_model = gbt_model.transform(transformer.df)
    return df_model, gbt_model
Example #9
def gbdtClassification(df, arguments):
	from pyspark.ml.classification import GBTClassifier
	numTrees = 20
	stepSize = 0.1
	maxDepth = 5
	minInstancesPerNode = 1

	if arguments.maxDepth is not None:
		maxDepth = int(arguments.maxDepth)

	if arguments.minInstancesPerNode is not None:
		minInstancesPerNode = int(arguments.minInstancesPerNode)

	if arguments.numTrees is not None:
		numTrees = int(arguments.numTrees)

	if arguments.stepSize is not None:
		stepSize = float(arguments.stepSize)

	gbdt = GBTClassifier(maxIter=numTrees,
						 stepSize=stepSize,
						 maxDepth=maxDepth,
						 minInstancesPerNode=minInstancesPerNode)
	model = gbdt.fit(df)

	return model
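The arguments object here looks like an argparse namespace; a hypothetical invocation (train_df is assumed to be a label/features DataFrame):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--maxDepth')
parser.add_argument('--minInstancesPerNode')
parser.add_argument('--numTrees')
parser.add_argument('--stepSize')
args = parser.parse_args(['--numTrees', '50', '--stepSize', '0.05'])

model = gbdtClassification(train_df, args)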
Example #10
def run_gradient_boost(tn_data, ts_data):
    gbt = GBTClassifier(featuresCol="scaled_features",
                        labelCol="output",
                        predictionCol="prediction")
    gbtModel = gbt.fit(tn_data)
    predictions = gbtModel.transform(ts_data)

    print_perf_eval(predictions)
Example #11
def transform_predictions(dataframe, spark):
    df_transformed = dataframe.drop("Patient addmited to regular ward (1=yes, 0=no)",
                                    "Patient addmited to semi-intensive unit (1=yes, 0=no)",
                                    "Patient addmited to intensive care unit (1=yes, 0=no)")

    df_transformed_no_missing = dismiss_missing_values(df_transformed)

    # build the dataset to be used as a rf_model base
    outcome_features = ["SARS-Cov-2 exam result"]
    required_features = ['Hemoglobin', 'Hematocrit', 'Platelets', 'Eosinophils', 'Red blood Cells', 'Lymphocytes',
                         'Leukocytes', 'Basophils', 'Monocytes']

    assembler = VectorAssembler(inputCols=required_features, outputCol='features')
    model_data = assembler.transform(df_transformed_no_missing)

    # split the dataset into train/test subgroups
    (training_data, test_data) = model_data.randomSplit([0.8, 0.2], seed=2020)

    # Random Forest classifier
    rf = RandomForestClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features', maxDepth=5)
    rf_model = rf.fit(training_data)
    rf_predictions = rf_model.transform(test_data)

    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    rf_accuracy = multi_evaluator.evaluate(rf_predictions)

    # Decision Tree Classifier
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxDepth=3)
    dt_model = dt.fit(training_data)
    dt_predictions = dt_model.transform(test_data)
    dt_predictions.select(outcome_features + required_features).show(10)

    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    dt_accuracy = multi_evaluator.evaluate(dt_predictions)

    # Logistic Regression Model
    lr = LogisticRegression(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxIter=10)
    lr_model = lr.fit(training_data)
    lr_predictions = lr_model.transform(test_data)

    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    lr_accuracy = multi_evaluator.evaluate(lr_predictions)

    # Gradient-boosted Tree classifier Model
    gb = GBTClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features')
    gb_model = gb.fit(training_data)
    gb_predictions = gb_model.transform(test_data)

    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    gb_accuracy = multi_evaluator.evaluate(gb_predictions)

    rdd = spark.sparkContext.parallelize([rf_accuracy, dt_accuracy, lr_accuracy, gb_accuracy])
    predictions_dataframe = spark.createDataFrame(rdd, FloatType())

    return predictions_dataframe
Example #12
def universities_example(spark, resources_folder):
    data = spark.read.csv(resources_folder + 'College.csv',
                          header=True,
                          inferSchema=True)
    data.printSchema()
    data.show()

    assembler = VectorAssembler(inputCols=[
        'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
        'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD',
        'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'
    ],
                                outputCol='features')
    data_assembled = assembler.transform(data)
    private_state_indexer = StringIndexer(inputCol='Private',
                                          outputCol='PrivateIndex')
    data_transformed = private_state_indexer.fit(data_assembled).transform(
        data_assembled)

    train_data, test_data = data_transformed.select(
        ['features', 'PrivateIndex']).randomSplit([0.6, 0.4])

    dtc = DecisionTreeClassifier(labelCol='PrivateIndex',
                                 featuresCol='features')
    rfc = RandomForestClassifier(labelCol='PrivateIndex',
                                 featuresCol='features')
    gbtc = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

    dtc_college_model = dtc.fit(train_data)
    rfc_college_model = rfc.fit(train_data)
    gbtc_college_model = gbtc.fit(train_data)

    dtc_predictions = dtc_college_model.transform(test_data)
    rfc_predictions = rfc_college_model.transform(test_data)
    gbtc_predictions = gbtc_college_model.transform(test_data)

    my_binary_evaluator = BinaryClassificationEvaluator(
        labelCol='PrivateIndex')
    print("DTC Evaluator")
    print(my_binary_evaluator.evaluate(dtc_predictions))
    print("RFC Evaluator")
    print(my_binary_evaluator.evaluate(rfc_predictions))
    print("DTC Evaluator")
    my_binary_evaluator = BinaryClassificationEvaluator(
        labelCol='PrivateIndex', rawPredictionCol='prediction')
    print(my_binary_evaluator.evaluate(gbtc_predictions))

    # No se puede hacer una evaluación del accuracy con un BinaryClassificationEvaluator para eso toca usar un
    # MulticlassClassificationEvaluator
    acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',
                                                 metricName='accuracy')
    rfc_accuracy = acc_eval.evaluate(rfc_predictions)
    print(rfc_accuracy)
Example #13
def testClassification(train, test):
    # Train a GradientBoostedTrees model.

    gbt = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = gbt.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = BinaryClassificationMetrics(predictionAndLabels)
    print("AUC %.3f" % metrics.areaUnderROC)
Example #15
def gradientBoosting(df,
                     feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                     maxIter=20,
                     stepSize=0.1):
    # Checks if there is a SparkContext running if so grab that if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)

    df = df_temp.select(['label', 'features'])

    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
    model = gbt.fit(trainingData)

    predictions = model.transform(testData)
    #predictions.select("prediction", "label").show(40)
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    # accuracy = evaluator.evaluate(predictions, {evaluator.metricName:"Precision"})
    auc = evaluator.evaluate(predictions)

    # test distribution of outputs
    total = df.select('label').count()
    disk = df.filter(df.label == 0).count()
    cloud = df.filter(df.label == 1).count()

    # print outputs
    print('Gradient-Boosted Tree')
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(feature_list)

    # print(" Test Error = {}".format((1.0 - accuracy) * 100))
    # print(" Test Accuracy = {}\n".format(accuracy * 100))
    print(" Test AUC = {}\n".format(auc * 100))

    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    disk_misses = misses.filter(misses.label == 0).count()
    cloud_misses = misses.filter(misses.label == 1).count()

    print(' Cloud Misses %{}'.format((cloud_misses / cloud) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk) * 100))

    return auc, 'Gradient Boosted: {}'.format(auc), model
Example #16
def gbt_classifier(training, testing):
    from pyspark.ml.classification import GBTClassifier

    # Train a GBT model.
    gbt = GBTClassifier(maxIter=10)
    # Train model.  This also runs the indexers.
    gbt_model = gbt.fit(training)

    # Make predictions.
    gbt_predictions = gbt_model.transform(testing)

    #Evaluate model
    gbt_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    gbt_accuracy = gbt_evaluator.evaluate(gbt_predictions)
    return gbt_accuracy
Example #17
def main():
    spark = SparkSession \
        .builder \
        .appName("RandomForest") \
        .config("spark.executor.heartbeatInterval", "60s") \
        .getOrCreate()

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    sc.setLogLevel("INFO")

    # Loading the test data
    df_test = spark.read.parquet(sys.argv[1])

    df_test, df_train = df_test.randomSplit([0.3, 0.7])
    df_train_indexed = df_train.selectExpr("label as indexedLabel", "features as indexedFeatures")
    df_test_indexed = df_test.selectExpr("label as indexedLabel", "features as indexedFeatures")

    # # Load the model
    # rf_model = RandomForestClassificationModel.load(sys.argv[2])
    #
    # # Make the predictions
    # predictions = rf_model.transform(df_test)
    gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=100,maxBins=24000000)
    model=gbt.fit(df_train_indexed)
    predictions = model.transform(df_test_indexed)

    # predictionsRDD=predictions.rdd

    # predictionsRDD.saveAsTextFile(sys.argv[3]+"output.text")

    evaluator_acc = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="indexedLabel",
                                                      metricName="accuracy")
    accuracy = evaluator_acc.evaluate(predictions)

    print "accuracy *******************"
    print accuracy

    evaluator_pre = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="indexedLabel",
                                                      metricName="weightedPrecision")

    print "precision *******************"
    print evaluator_pre.evaluate(predictions)

    print "recall **********************"
    print MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="indexedLabel",
                                            metricName="weightedRecall").evaluate(predictions)
Example #18
def evaluateGradientBoostTree(trainDF, testDF):
    """
    Trains gradient boost tree classifiers over a range of stepSize
    parameters and returns a list of benchmark data, one entry per
    stepSize value.
    """

    benchmarkData = []
    for stepsize in [0.1]:
        classifier = GBTClassifier(stepSize=stepsize)
        model = classifier.fit(trainDF)
        predictions = model.transform(testDF)
        print("Gradient Boost Tree with stepsize : {}".format(stepsize))
        accuracy = printevaluatation(model, predictions)
        benchmarkData += [("GBT", "stepsize", stepsize, float(accuracy))]
    return benchmarkData
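The returned list converts naturally into a DataFrame for inspection (a sketch; the column names are assumptions):

bench = evaluateGradientBoostTree(trainDF, testDF)
bench_df = spark.createDataFrame(bench, ["model", "param", "value", "accuracy"])
bench_df.show()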
Example #19
def models():
    rf_classifier = RandomForestClassifier(labelCol="label",
                                           featuresCol="features")
    print("Random Forest F1 = %g" % evaluate(rf_classifier))
    lsvc = LinearSVC(maxIter=50)
    print("Linear SVC F1 = %g" % evaluate(lsvc))
    gbt = GBTClassifier()
    print("GBT F1 = %g" % evaluate(gbt))

    mlp = MultilayerPerceptronClassifier(seed=1234, featuresCol='features')
    print("MLP F1 = %g" % evaluate(mlp))

    fm = FMClassifier()
    print('FM')
    evaluate(fm)
    featurize_lda()
    # NGrams
    # print("NGram Random Forest F1 = %g" % evaluate(rf_classifier, "ngrams"))
    # print("Ngram Linear SVC F1 = %g" % evaluate(lsvc, "ngrams"))
    # print("Ngram GBT F1 = %g" % evaluate(gbt, "ngrams"))
    # TF-IDF
    print("Ngram TF-IDF Random Forest F1 = %g" %
          evaluate(rf_classifier, "ngrams", "TF-IDF"))
    print("Ngram TF-IDF Linear SVC F1 = %g" %
          evaluate(lsvc, "ngrams", "TF-IDF"))
    print("Ngram TF-IDF GBT F1 = %g" % evaluate(gbt, "ngrams", "TF-IDF"))
    print("Words TF-IDF Random Forest F1 = %g" %
          evaluate(rf_classifier, "words", "TF-IDF"))
    print("Words TF-IDF Linear SVC F1 = %g" %
          evaluate(lsvc, "words", "TF-IDF"))
    print("Words TF-IDF GBT F1 = %g" % evaluate(gbt, "words", "TF-IDF"))
Example #20
def myGBT(training, test, labelColumnName):
    # Set up the default classifier
    gbt = GBTClassifier(featuresCol="features",
                        labelCol="label",
                        predictionCol="prediction",
                        cacheNodeIds=True)

    # Build the parameter grid
    paramGrid_gbt = ParamGridBuilder().addGrid(
        gbt.maxDepth,
        [5, 8, 10]).addGrid(gbt.minInfoGain, [0.0, 0.001]).addGrid(
            gbt.minInstancesPerNode,
            [1, 3]).addGrid(gbt.maxIter,
                            [100, 150, 200]).addGrid(gbt.stepSize,
                                                     [0.01, 0.1]).build()

    # Invocation via the modeling class
    bestModel_gbt, best_epm_gbt, best_sampling_gbt = module.modeling()._fit(
        training, gbt, paramGrid_gbt, [0.2, 0.5, 0.8], 3)

    # Predict
    all_list = training.columns
    all_list.remove(labelColumnName)
    assembler = VectorAssembler().setInputCols(all_list).setOutputCol(
        "features_vector")
    test = assembler.transform(test)
    predictions_gbt = bestModel_gbt.transform(test)

    # Confusion matrix
    predictions_gbt.groupBy('label', 'prediction').count().show()
    return predictions_gbt
Example #21
def clf_gbt(feature, target):
    gbt = GBTClassifier(featuresCol=feature, labelCol=target, maxIter=10, seed=_seed, cacheNodeIds=True)
    paramGrid = (ParamGridBuilder()
        .addGrid(gbt.maxDepth, [10, 15, 20])
        .addGrid(gbt.stepSize, [0.05, 0.1, 0.5])
        .build())
    return gbt, paramGrid
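A hypothetical way to wire the returned pair into cross-validation (assumes _seed is defined and train_df is a label/features DataFrame):

from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

gbt, grid = clf_gbt("features", "label")
cv = CrossValidator(estimator=gbt, estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(), numFolds=3)
cv_model = cv.fit(train_df)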
Example #22
    def __pipeline(self, modeling_code: str, classifiers_metadata: dict,
                   database_url_training: str, database_url_test: str) -> None:

        (features_training, features_testing, features_evaluation) = \
            self.__modeling_code_processing(
                modeling_code,
                self.__spark_session,
                database_url_training,
                database_url_test)

        classifier_switcher = {
            "LR": LogisticRegression(),
            "DT": DecisionTreeClassifier(),
            "RF": RandomForestClassifier(),
            "GB": GBTClassifier(),
            "NB": NaiveBayes(),
        }
        classifier_threads = []

        for name, metadata in classifiers_metadata.items():
            classifier = classifier_switcher[name]
            classifier_threads.append(
                self.__thread_pool.submit(
                    self.__classifier_processing,
                    classifier,
                    features_training,
                    features_testing,
                    features_evaluation,
                    metadata,
                ))

        for classifier in classifier_threads:
            testing_prediction, metadata_document = classifier.result()
            self.__save_classifier_result(testing_prediction,
                                          metadata_document)
Example #23
def Distr_GBTClassifier(xy_train, xy_test):
    gf = GBTClassifier(minInstancesPerNode=20, maxDepth=25)
    evalu = BinaryClassificationEvaluator()
    grid_1 = ParamGridBuilder()\
            .addGrid(gf.maxIter, [100])\
            .addGrid(gf.subsamplingRate, [0.5,0.8,1.0])\
            .build()
    cv_1 = CrossValidator(estimator=gf,
                          estimatorParamMaps=grid_1,
                          evaluator=evalu,
                          numFolds=5)
    # Search for the best parameter combination; cvModel will return the estimated best model
    cvModel_1 = cv_1.fit(xy_train)
    print("Grid scores: ")
    best_params_1 = Get_best_params(cvModel_1)['subsamplingRate']
    grid = ParamGridBuilder()\
            .addGrid(gf.maxIter, [300,500])\
            .addGrid(gf.subsamplingRate, [best_params_1,])\
            .build()
    cv = CrossValidator(estimator=gf,
                        estimatorParamMaps=grid,
                        evaluator=evalu,
                        numFolds=5)
    # Search for the best parameter combination; cvModel will return the estimated best model
    cvModel = cv.fit(xy_train)
    best_params = Get_best_params(cvModel)

    print("Best parameters set found: %s" % best_params)

    return cvModel.bestModel
Example #24
 def gbtc(self, maxIter=10):
     self.time_calc.start_time('\nGradient-boosted tree classifier')
     gbtc = GBTClassifier(labelCol=self.label_col,
                          featuresCol=self.features_col,
                          maxIter=maxIter)
     self.classify('gbtc', gbtc, True)
     self.time_calc.end_time('Gradient-boosted tree classifier')
Example #25
    def _spark_rf(self):
        self.df = spark.createDataFrame(self.data)

        features = []
        for col in self.df.columns:
            if col == 'pred':
                continue
            else:
                features.append(col)

        (trainingData, testData) = self.df.randomSplit([0.7, 0.3],
                                                       seed=24234232)

        assembler = VectorAssembler(inputCols=features, outputCol="features")
        #rf = RandomForestClassifier(labelCol="pred", featuresCol="features", numTrees=500)
        gbt = GBTClassifier(labelCol="pred",
                            featuresCol="features",
                            maxIter=200)
        pipeline = Pipeline(stages=[assembler, gbt])

        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)

        # Select (prediction, true label) and compute test error
        evaluator = MulticlassClassificationEvaluator(
            labelCol="pred", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
Example #26
def gbdt_core(df, condition):
    """
    Core GBDT binary-classification function
    :param spark_session:
    :param df:
    :param condition: {"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 0.1, "maxDepth": 5, "minInstancesPerNode": 1, "seed": 1}
    :return:
    """

    # Parameters
    label_index = condition['label']  # label column (name or index)
    feature_indexs = condition['features']  # feature columns (names or indices)
    iterations = condition['iterations']  # number of iterations
    step = condition['step']  # learning rate (0-1)
    max_depth = condition['maxDepth']  # maximum tree depth [1, 100]
    minInstancesPerNode = condition['minInstancesPerNode']  # minimum samples per leaf node [1, 1000]
    seed = condition['seed']  # random number generator seed [0, 10]

    # 1. Prepare the data
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = df.rdd.map(lambda x: func(x)).toDF()

    string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = string_indexer.fit(training_set)
    tf = si_model.transform(training_set)

    # 2. Train
    gbdt = GBTClassifier(labelCol="indexed",
                         maxIter=iterations,
                         stepSize=step,
                         maxDepth=max_depth,
                         minInstancesPerNode=minInstancesPerNode,
                         seed=seed)
    gbdt_model = gbdt.fit(tf)
    print(gbdt_model.featureImportances)

    # 3. Save the model
    svm_model_path = model_url() + '/gbdt/' + str(uuid.uuid1())
    deltree(svm_model_path)  # delete any existing model
    gbdt_model.write().overwrite().save(svm_model_path)

    return svm_model_path
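A hypothetical condition payload for an English-named dataset, driving the function above:

condition = {
    "label": "label",                           # label column
    "features": ["qty", "discount", "profit"],  # hypothetical feature columns
    "iterations": 20,
    "step": 0.1,
    "maxDepth": 5,
    "minInstancesPerNode": 1,
    "seed": 1,
}
model_path = gbdt_core(df, condition)  # returns the path the model was saved to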
Example #27
def _get_xgboost_classifier_model(col, train):
    '''
    Gradient Boosted Tree Classifier Model is created for predicting Missing Values
    '''
    print(
        'Using Gradient Boosted Tree Classifier Model to predict Missing Values ...'
    )
    cla_model = GBTClassifier(labelCol=col)
    #params = ParamGridBuilder().addGrid(cla_model.maxDepth, [5, 10, 20]).\
    #                            addGrid(cla_model.minInfoGain, [0.0, 0.01, 1.0]).\
    #                            addGrid(cla_model.maxBins, [32, 20, 50, 100, 300]).build()
    #cv = CrossValidator(estimator=cla_model,
    #                   estimatorParamMaps=params,
    #                   evaluator=BinaryClassificationEvaluator(labelCol=col),
    #                   numFolds=10)
    cla_model = cla_model.fit(train)
    return cla_model
Example #28
def model_dev_gbm(df_train, df_test, max_depth, max_bins, max_iter):
    
    gbm_start_time = time()
    
    # Create an Initial Model Instance
    mod_gbm= GBTClassifier(labelCol='label',
                           featuresCol='features',
                           maxDepth=max_depth,
                           maxBins=max_bins,
                           maxIter=max_iter)
    
    # Training The Model
    gbm_final_model = mod_gbm.fit(df_train)
    
    # Scoring The Model On Test Sample
    gbm_transformed = gbm_final_model.transform(df_test)
    gbm_test_results = gbm_transformed.select(['prediction', 'label'])
    gbm_predictionAndLabels= gbm_test_results.rdd
    gbm_test_metrics = MulticlassMetrics(gbm_predictionAndLabels)
    
    # Collecting The Model Statistics
    gbm_cm = gbm_test_metrics.confusionMatrix().toArray()
    gbm_accuracy = round(float((gbm_cm[0][0]+gbm_cm[1][1])/gbm_cm.sum())*100, 2)
    gbm_precision = round(float((gbm_cm[0][0])/(gbm_cm[0][0]+gbm_cm[1][0]))*100, 2)
    gbm_recall = round(float((gbm_cm[0][0])/(gbm_cm[0][0]+gbm_cm[0][1]))*100, 2)
    gbm_auc = round(float(BinaryClassificationMetrics(gbm_predictionAndLabels).areaUnderROC)*100, 2)

    # Printing The Model Statistics
    print("\n++++++ Printing GBM Model Accuracy ++++++\n")
    print("Accuracy: "+str(gbm_accuracy)+"%")
    print("AUC: "+str(gbm_auc)+"%")
    print("Precision: "+str(gbm_precision)+"%")
    print("Recall: "+str(gbm_recall)+"%")
    gbm_end_time = time()
    
    gbm_elapsed_time = (gbm_end_time - gbm_start_time)/60
    gbm_model_stat = pd.DataFrame({"Model Name" : ["Gradient Boosting Machine"],
                                  "Accuracy" : gbm_accuracy,
                                  "AUC": gbm_auc, 
                                  "Precision": gbm_precision,
                                  "Recall": gbm_recall, 
                                  "Time (Min.)": round(gbm_elapsed_time,3)})
    gbm_output = (gbm_final_model,gbm_model_stat,gbm_cm)
    print("Time To Build GBM Model: %.3f Minutes" % gbm_elapsed_time)
    
    return(gbm_output)
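A hypothetical call, unpacking the three-part result:

gbm_model, gbm_stats, gbm_confusion = model_dev_gbm(df_train, df_test,
                                                    max_depth=5,
                                                    max_bins=32,
                                                    max_iter=20)
print(gbm_stats)  # one-row pandas DataFrame with accuracy, AUC, precision, recall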
Example #29
    def pipeline(self, modeling_code, classifiers_metadata):
        spark_session = (
            SparkSession
                .builder
                .appName("modelBuilder")
                .config("spark.driver.port", os.environ[SPARK_DRIVER_PORT])
                .config("spark.driver.host",
                        os.environ[MODEL_BUILDER_HOST_NAME])
                .config("spark.jars.packages",
                        "org.mongodb.spark:mongo-spark-connector_2.11:2.4.2",
                        )
                .config("spark.scheduler.mode", "FAIR")
                .config("spark.scheduler.pool", "modelBuilder")
                .config("spark.scheduler.allocation.file",
                        "./fairscheduler.xml")
                .master("spark://"
                        + os.environ[SPARKMASTER_HOST]
                        + ":"
                        + str(os.environ[SPARKMASTER_PORT])
                        )
                .getOrCreate()
        )

        (features_training, features_testing, features_evaluation) = \
            self.modeling_code_processing(
                modeling_code,
                spark_session)

        classifier_switcher = {
            "LR": LogisticRegression(),
            "DT": DecisionTreeClassifier(),
            "RF": RandomForestClassifier(),
            "GB": GBTClassifier(),
            "NB": NaiveBayes(),
        }
        classifier_threads = []

        for name, metadata in classifiers_metadata.items():
            classifier = classifier_switcher[name]
            classifier_threads.append(
                self.thread_pool.submit(
                    Model.classifier_processing,
                    classifier,
                    features_training,
                    features_testing,
                    features_evaluation,
                    metadata,
                )
            )

        for classifier in classifier_threads:
            testing_prediction, metadata_document = classifier.result()
            self.save_classifier_result(
                testing_prediction,
                metadata_document
            )

        spark_session.stop()
Example #30
def estimators(config):
    # All models to choose amongst for simple regression/classification
    model_type = config['base']['model_type']    
    model = config['base']['model']
    if model == 'rf':
        if model_type == 'classification':
            glm = RandomForestClassifier(
                        featuresCol = config['base']['featuresCol'],
                        labelCol = config['base']['labelCol'],
                        predictionCol = config['base']['predictionCol'],
                        numTrees = config['model']['numTrees'],
                        maxDepth = config['model']['maxDepth']
                        )
        elif model_type == 'regression':
            glm = RandomForestRegressor(
                        featuresCol = config['base']['featuresCol'],
                        labelCol = config['base']['labelCol'],
                        predictionCol = config['base']['predictionCol'],
                        numTrees = config['model']['numTrees'],
                        maxDepth = config['model']['maxDepth']
                        )
    if model == 'gbm':
        if model_type == 'classification':
            glm = GBTClassifier(
                        featuresCol = config['base']['featuresCol'],
                        labelCol = config['base']['labelCol'],
                        predictionCol = config['base']['predictionCol'],
                        lossType = config['model']['lossType'],
                        maxDepth = config['model']['maxDepth'],
                        stepSize = config['model']['stepSize']
                        )
        elif model_type == 'regression':
            glm = GBTRegressor(
                        featuresCol = config['base']['featuresCol'],
                        labelCol = config['base']['labelCol'],
                        predictionCol = config['base']['predictionCol'],
                        lossType = config['model']['lossType'],
                        maxDepth = config['model']['maxDepth'],
                        stepSize = config['model']['stepSize']
                        )
    if model == 'logistic':
        glm = LogisticRegression(
                    featuresCol = config['base']['featuresCol'],
                    labelCol = config['base']['labelCol'],
                    predictionCol = config['base']['predictionCol'],
                    threshold = config['model']['threshold'],
                    regParam = config['model']['regParam'],
                    elasticNetParam = config['model']['elasticNetParam']
                    )
    if model == 'linear':
        glm = LinearRegression(
                    featuresCol = config['base']['featuresCol'],
                    labelCol = config['base']['labelCol'],
                    predictionCol = config['base']['predictionCol'],
                    regParam = config['model']['regParam'],
                    elasticNetParam = config['model']['elasticNetParam']
                    )
    return glm
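The config layout this factory expects, sketched for the GBM-classification branch (all values are hypothetical):

config = {
    'base': {
        'model_type': 'classification',
        'model': 'gbm',
        'featuresCol': 'features',
        'labelCol': 'label',
        'predictionCol': 'prediction',
    },
    'model': {
        'lossType': 'logistic',  # the only loss GBTClassifier supports
        'maxDepth': 5,
        'stepSize': 0.1,
    },
}
glm = estimators(config)  # returns an unfitted GBTClassifier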
Example #31
def predictions(train, test):
    # Apply the GBT technique
    GPT = GBTClassifier(featuresCol="Atributos", labelCol="Income", maxBins=41)
    GPT = GPT.fit(train)
    predictions = GPT.transform(test)
    results = predictions.select("Income", "prediction")
    predictionAndLabels = results.rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    cm = metrics.confusionMatrix().toArray()
    # Compute the metrics
    accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
    precision = cm[0][0] / (cm[0][0] + cm[1][0])
    recall = cm[0][0] / (cm[0][0] + cm[0][1])
    f1 = 2 * ((precision * recall) / (precision + recall))
    print("Metricas del modelo GBT Classifier")
    print("accuracy = {0}, precision = {1}, recall = {2}, f1 = {3}".format(
        accuracy, precision, recall, f1))
    return
Example #32
def helper1(r):
    features = []
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_"+'{0:04}'.format(i)+fe[i])))%D)
        target = float(r[-1])
        ID=float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0, Vectors.dense([0.0] * 1932))
new_rdd = rdd.filter(lambda i : len(i)==1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans,["label", "features"])
(trainingData, testData) = df.randomSplit([0.7, 0.3])

stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(trainingData)
td = si_model.transform(trainingData)

gbt = GBTClassifier(maxIter=100, maxDepth=10, labelCol="label")
model = gbt.fit(trainingData)
result = model.transform(testData).rdd.map(lambda r: str(r.label)+','+str(r.probability[0]))
result.saveAsTextFile("/user/demo/gbt_100_20")

df1 = sqlContext.createDataFrame([
     Row(label=0.0, features=Vectors.dense([0.0, 0.0])),
     Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
     Row(label=1.0, features=Vectors.dense([1.0, 0.0]))])