def gbtClassifier(train, test):
    """Fit a gradient-boosted tree classifier on *train* and score *test*.

    Shows the first 10 scored rows and returns the full prediction DataFrame.
    """
    estimator = GBTClassifier(maxIter=10)
    fitted = estimator.fit(train)
    scored = fitted.transform(test)
    scored.select('age', 'job', 'label', 'rawPrediction',
                  'prediction', 'probability').show(10)
    return scored
def basic_example(spark, resources_folder):
    """Train DT / RF / GBT classifiers on the sample libsvm data and print
    each model's predictions, accuracy, and the RF feature importances."""
    dataset = spark.read.format('libsvm').load(resources_folder + 'sample_libsvm_data.txt')
    dataset.printSchema()
    dataset.show()
    train_data, test_data = dataset.randomSplit([0.6, 0.4])
    # Fit the three tree-based classifiers (insertion order matters for output).
    models = {
        'DTC': DecisionTreeClassifier().fit(train_data),
        'RFC': RandomForestClassifier().fit(train_data),
        'GBTC': GBTClassifier().fit(train_data),
    }
    preds = {name: fitted.transform(test_data) for name, fitted in models.items()}
    preds['DTC'].show()
    preds['RFC'].show()
    # GBT output has no rawPrediction column; a binary or multiclass evaluator
    # may still request rawPrediction as an input.
    preds['GBTC'].show()
    acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
    for name in ('DTC', 'RFC', 'GBTC'):
        print(name + " Accuracy")
        print(acc_eval.evaluate(preds[name]))
    print(models['RFC'].featureImportances)
def test_gbt_classifier(self):
    """Round-trip a Spark GBTClassifier through ONNX conversion and check the
    ONNX model reproduces Spark's prediction and probability outputs."""
    # Minimal two-row dataset: one positive, one all-zero (sparse) negative.
    raw_data = self.spark.createDataFrame([
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], []))
    ], ["label", "features"])
    string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = string_indexer.fit(raw_data)
    data = si_model.transform(raw_data)
    # Fixed seed keeps the tiny model deterministic across runs.
    gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
    model = gbt.fit(data)
    # Column 1 of the first row is the feature vector; its size fixes the
    # width of the ONNX input tensor.
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(model, 'Sparkml GBT Classifier', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # Run the Spark model to produce the expected outputs.
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    # save_data_models returns several paths; index 3 is the ONNX model file.
    paths = save_data_models(data_np, expected, model, model_onnx, basename="SparkmlGBTClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
    # Compare ONNX output against Spark output to 5 decimal places.
    compare_results(expected, output, decimal=5)
def exec_gradient_boost(self, featuresCol1="features", labelCol1="label", predictionCol1="prediction", maxIter1=30, numClass1=2):
    '''
    Builds the Gradient Boost model pipeline; applicable only to binary
    classification problems (numClass1 restricted to 2).

    Input: featuresCol1/labelCol1/predictionCol1: column names,
           maxIter1: maximum number of boosting iterations.
    Output: None (results go through self.model_evaluator).
    '''
    # Configure and fit the booster on the held training split.
    booster = GBTClassifier(featuresCol=featuresCol1, labelCol=labelCol1,
                            predictionCol=predictionCol1, maxIter=maxIter1)
    fitted = booster.fit(self.trainingData)
    # Score the test split and hand the results to the shared evaluator.
    test_predictions = fitted.transform(self.testData)
    self.model_evaluator(test_predictions, modelType="GradientBoost Model",
                         modelParams=str({'maxIter': maxIter1}), numClass=numClass1)
def gradient_boosted_tree_classifier(training_data, test_data, validation_data):
    """Train a GBT classifier, tune it with 5-fold cross-validation, and report
    areaUnderROC on validation and test splits.  (Observed ROC: 0.71.)"""
    booster = GBTClassifier(featuresCol='scaled_features', labelCol='label', maxIter=10)
    base_model = booster.fit(training_data)
    valid_scored = base_model.transform(validation_data)
    valid_scored.show(5)
    evaluate_metrics(valid_scored)
    roc_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                                  labelCol='label',
                                                  metricName="areaUnderROC")
    model_evaluator(evaluator=roc_evaluator, evaluator_name="areaUnderROC",
                    data=valid_scored, data_type="valid_data")
    # Hyper-parameter search space (addGrid mutates and returns the builder).
    grid_builder = ParamGridBuilder()
    grid_builder.addGrid(booster.maxDepth, [2, 4, 6])
    grid_builder.addGrid(booster.maxBins, [20, 60])
    grid_builder.addGrid(booster.maxIter, [10, 20])
    search_grid = grid_builder.build()
    cross_validator = CrossValidator(estimator=booster, estimatorParamMaps=search_grid,
                                     evaluator=roc_evaluator, numFolds=5)
    # Run cross validations.
    tuned = cross_validator.fit(training_data)
    cv_valid_scored = tuned.transform(validation_data)
    model_evaluator(evaluator=roc_evaluator, evaluator_name="areaUnderROC",
                    data=cv_valid_scored, data_type="valid_data")
    # Final evaluation of the best model on the untouched test split.
    test_scored = tuned.bestModel.transform(test_data)
    model_evaluator(evaluator=roc_evaluator, evaluator_name="areaUnderROC",
                    data=test_scored, data_type="test_data")
def evaluateGradientBoostTree(trainDF, testDF):
    """Fit GBT classifiers over a sweep of learning rates and print each evaluation."""
    for learning_rate in (0.01, 0.1, 1):
        booster = GBTClassifier(stepSize=learning_rate)
        fitted = booster.fit(trainDF)
        scored = fitted.transform(testDF)
        print("Gradient Boost Tree with stepsize : {}".format(learning_rate))
        printevaluatation(fitted, scored)
def gbt(df, columns, input_col, **kwargs):
    """
    Runs a gradient boosting tree classifier for input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with gradient boosting tree and prediction run.
    """
    # Guard clauses: validate inputs before touching the data.
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")
    columns = parse_columns(df, columns)
    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")
    # Every selected column except the target becomes a feature.
    feature_cols = df.select(columns).columns
    feature_cols.remove(input_col)
    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feature_cols, output_col="features")
    estimator = GBTClassifier(**kwargs)
    df = df.cols.rename(name_col(input_col, "index_to_string"), "label")
    fitted = estimator.fit(df)
    return fitted.transform(df), fitted
def gbt(df, columns, input_col):
    """
    Runs a gradient boosting tree classifier for input DataFrame.

    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with gradient boosting tree and prediction run.
    :raises TypeError: if columns is not a list or input_col is not a string.
    """
    assert_spark_df(df)
    # FIX: validate with explicit raises — `assert` is stripped under
    # `python -O`, silently skipping these checks. TypeError matches the
    # sibling gbt(df, columns, input_col, **kwargs) implementation.
    if not isinstance(columns, list):
        raise TypeError("Error, columns must be a list")
    if not isinstance(input_col, str):
        raise TypeError("Error, input column must be a string")
    data = df.select(columns)
    # All selected columns except the target become features.
    feats = data.columns
    feats.remove(input_col)
    transformer = op.DataFrameTransformer(data)
    transformer.string_to_index(input_cols=input_col)
    transformer.vector_assembler(input_cols=feats)
    model = GBTClassifier()
    # The indexed target column must be named "label" for the estimator.
    transformer.rename_col(columns=[(input_col + "_index", "label")])
    gbt_model = model.fit(transformer.df)
    df_model = gbt_model.transform(transformer.df)
    return df_model, gbt_model
def gbdtClassification(df, arguments):
    """Fit a GBT classifier on *df*, with CLI-style *arguments* overriding defaults.

    :param df: training DataFrame (Spark ML `label`/`features` convention —
        TODO confirm against caller).
    :param arguments: object with optional attributes maxDepth,
        minInstancesPerNode, numTrees, stepSize (None when not supplied).
    :return: fitted GBT classification model.
    """
    from pyspark.ml.classification import GBTClassifier
    # Defaults used when an argument is absent.
    numTrees = 20
    stepSize = 0.1
    maxDepth = 5
    minInstancesPerNode = 1
    # FIX: tree-structure parameters (maxDepth, minInstancesPerNode, and
    # maxIter via numTrees) are integer params — the original cast them with
    # float(), which at best silently truncates and can be rejected by
    # pyspark's param type validation. stepSize is genuinely a float.
    # Also use `is not None` instead of `!= None`.
    if arguments.maxDepth is not None:
        maxDepth = int(arguments.maxDepth)
    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = int(arguments.minInstancesPerNode)
    if arguments.numTrees is not None:
        numTrees = int(arguments.numTrees)
    if arguments.stepSize is not None:
        stepSize = float(arguments.stepSize)
    gbdt = GBTClassifier(maxIter=numTrees, stepSize=stepSize, maxDepth=maxDepth,
                         minInstancesPerNode=minInstancesPerNode)
    return gbdt.fit(df)
def run_gradient_boost(tn_data, ts_data):
    """Fit a GBT classifier on the training split and evaluate it on the test split."""
    booster = GBTClassifier(featuresCol="scaled_features", labelCol="output",
                            predictionCol="prediction")
    scored = booster.fit(tn_data).transform(ts_data)
    print_perf_eval(scored)
def transform_predictions(dataframe, spark):
    """Train RF/DT/LR/GBT classifiers on COVID exam data and return their accuracies.

    :param dataframe: raw exam-results DataFrame.
    :param spark: active SparkSession.
    :return: single-column float DataFrame with four accuracies, in order
        [random forest, decision tree, logistic regression, gradient-boosted trees].
    """
    # Drop the admission-outcome columns; only the exam result is predicted.
    df_transformed = dataframe.drop(
        "Patient addmited to regular ward (1=yes, 0=no)",
        "Patient addmited to semi-intensive unit (1=yes, 0=no)",
        "Patient addmited to intensive care unit (1=yes, 0=no)")
    df_transformed_no_missing = dismiss_missing_values(df_transformed)
    # build the dataset to be used as a rf_model base
    outcome_features = ["SARS-Cov-2 exam result"]
    required_features = ['Hemoglobin', 'Hematocrit', 'Platelets', 'Eosinophils',
                         'Red blood Cells', 'Lymphocytes', 'Leukocytes',
                         'Basophils', 'Monocytes']
    assembler = VectorAssembler(inputCols=required_features, outputCol='features')
    model_data = assembler.transform(df_transformed_no_missing)
    # split the dataset into train/test subgroups (fixed seed for reproducibility)
    (training_data, test_data) = model_data.randomSplit([0.8, 0.2], seed=2020)
    # Random Forest classifier
    rf = RandomForestClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features', maxDepth=5)
    rf_model = rf.fit(training_data)
    rf_predictions = rf_model.transform(test_data)
    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    rf_accuracy = multi_evaluator.evaluate(rf_predictions)
    # Decision Tree Classifier
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxDepth=3)
    dt_model = dt.fit(training_data)
    dt_predictions = dt_model.transform(test_data)
    dt_predictions.select(outcome_features + required_features).show(10)
    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    dt_accuracy = multi_evaluator.evaluate(dt_predictions)
    # Logistic Regression Model
    lr = LogisticRegression(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxIter=10)
    lr_model = lr.fit(training_data)
    lr_predictions = lr_model.transform(test_data)
    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    lr_accuracy = multi_evaluator.evaluate(lr_predictions)
    # Gradient-boosted Tree classifier Model
    gb = GBTClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features')
    gb_model = gb.fit(training_data)
    gb_predictions = gb_model.transform(test_data)
    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    gb_accuracy = multi_evaluator.evaluate(gb_predictions)
    # Collect the four accuracies into a one-column Spark DataFrame.
    rdd = spark.sparkContext.parallelize([rf_accuracy, dt_accuracy, lr_accuracy, gb_accuracy])
    predictions_dataframe = spark.createDataFrame(rdd, FloatType())
    return predictions_dataframe
def universities_example(spark, resources_folder):
    """Compare DT/RF/GBT classifiers predicting whether a college is private.

    Loads College.csv, assembles the numeric columns into a feature vector,
    indexes the Private column as the label, and prints AUC/accuracy metrics.
    """
    data = spark.read.csv(resources_folder + 'College.csv', header=True, inferSchema=True)
    data.printSchema()
    data.show()
    # BUG FIX: the original list had `'Outstate' 'Room_Board'` (missing comma),
    # which Python concatenates into the single bogus name 'OutstateRoom_Board'.
    assembler = VectorAssembler(inputCols=[
        'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
        'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD',
        'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'
    ], outputCol='features')
    data_assembled = assembler.transform(data)
    private_state_indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')
    data_transformed = private_state_indexer.fit(data_assembled).transform(data_assembled)
    train_data, test_data = data_transformed.select(
        ['features', 'PrivateIndex']).randomSplit([0.6, 0.4])
    dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
    rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features')
    gbtc = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')
    dtc_college_model = dtc.fit(train_data)
    rfc_college_model = rfc.fit(train_data)
    gbtc_college_model = gbtc.fit(train_data)
    dtc_predictions = dtc_college_model.transform(test_data)
    rfc_predictions = rfc_college_model.transform(test_data)
    gbtc_predictions = gbtc_college_model.transform(test_data)
    my_binary_evaluator = BinaryClassificationEvaluator(labelCol='PrivateIndex')
    print("DTC Evaluator")
    print(my_binary_evaluator.evaluate(dtc_predictions))
    print("RFC Evaluator")
    print(my_binary_evaluator.evaluate(rfc_predictions))
    # BUG FIX: this label printed "DTC Evaluator" but the metric is for GBT.
    print("GBTC Evaluator")
    # GBT output lacks a usable rawPrediction here, so evaluate on 'prediction'.
    my_binary_evaluator = BinaryClassificationEvaluator(
        labelCol='PrivateIndex', rawPredictionCol='prediction')
    print(my_binary_evaluator.evaluate(gbtc_predictions))
    # Accuracy cannot be computed with a BinaryClassificationEvaluator; a
    # MulticlassClassificationEvaluator is required for that.
    acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex', metricName='accuracy')
    rfc_accuracy = acc_eval.evaluate(rfc_predictions)
    print(rfc_accuracy)
def testClassification(train, test):
    """Train a GradientBoostedTrees model and print its test AUC.

    :param train: DataFrame with `features` and `indexedLabel` columns.
    :param test: DataFrame with the same schema.
    """
    gbt = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel")
    model = gbt.fit(train)
    # BUG FIX: DataFrame has no .map() in Spark 2+; go through .rdd —
    # BinaryClassificationMetrics expects an RDD of (prediction, label) pairs.
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))
    metrics = BinaryClassificationMetrics(predictionAndLabels)
    print("AUC %.3f" % metrics.areaUnderROC)
def gradientBoosting(df, feature_list=None, maxIter=20, stepSize=0.1):
    """Train and evaluate a gradient-boosted tree classifier on *df*.

    :param df: DataFrame with a binary `label` column (0 = disk, 1 = cloud —
        per the print labels below) plus the feature columns.
    :param feature_list: columns assembled into the feature vector
        (default: ['BFSIZE', 'HDRSIZE', 'NODETYPE']).
    :param maxIter: number of boosting iterations.
    :param stepSize: learning rate.
    :return: (auc, summary string, fitted model)
    """
    # FIX: avoid a shared mutable default argument.
    if feature_list is None:
        feature_list = ['BFSIZE', 'HDRSIZE', 'NODETYPE']
    vector_assembler = VectorAssembler(inputCols=feature_list, outputCol="features")
    # (The original withColumnRenamed('label', 'label') was a no-op; dropped.)
    df = vector_assembler.transform(df).select(['label', 'features'])
    (trainingData, testData) = df.randomSplit([0.7, 0.3])
    # BUG FIX: the maxIter/stepSize parameters were accepted but ignored
    # (maxIter was hard-coded to 10); pass them through to the estimator.
    gbt = GBTClassifier(labelCol="label", featuresCol="features",
                        maxIter=maxIter, stepSize=stepSize)
    model = gbt.fit(trainingData)
    predictions = model.transform(testData)
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    auc = evaluator.evaluate(predictions)
    # Class distribution of the full dataset.
    total = df.select('label').count()
    disk = df.filter(df.label == 0).count()
    cloud = df.filter(df.label == 1).count()
    print('Gradient-Boosted Tree')
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(feature_list)
    print(" Test AUC = {}\n".format(auc * 100))
    # Per-class miss rates among mispredicted rows.
    misses = predictions.filter(predictions.label != predictions.prediction)
    disk_misses = misses.filter(misses.label == 0).count()
    cloud_misses = misses.filter(misses.label == 1).count()
    print(' Cloud Misses %{}'.format((cloud_misses / cloud) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk) * 100))
    return auc, 'Gradient Boosted: {}'.format(auc), model
def gbt_classifier(training, testing):
    """Fit a GBT model on *training* and return its accuracy on *testing*."""
    from pyspark.ml.classification import GBTClassifier

    # Train, score, and evaluate in one pass.
    fitted = GBTClassifier(maxIter=10).fit(training)
    scored = fitted.transform(testing)
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    return evaluator.evaluate(scored)
def main():
    """Load parquet data, train a GBT classifier, and print accuracy,
    weighted precision, and weighted recall on the held-out split.

    sys.argv[1] is the parquet input path.
    """
    spark = SparkSession \
        .builder \
        .appName("RandomForest") \
        .config("spark.executor.heartbeatInterval", "60s") \
        .getOrCreate()
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)
    sc.setLogLevel("INFO")
    # Loading the test data
    df_test = spark.read.parquet(sys.argv[1])
    df_test, df_train = df_test.randomSplit([0.3, 0.7])
    df_train_indexed = df_train.selectExpr("label as indexedLabel", "features as indexedFeatures")
    df_test_indexed = df_test.selectExpr("label as indexedLabel", "features as indexedFeatures")
    # NOTE(review): maxBins=24000000 is extremely high — confirm intentional.
    gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                        maxIter=100, maxBins=24000000)
    model = gbt.fit(df_train_indexed)
    predictions = model.transform(df_test_indexed)
    # BUG FIX: Python 2 print statements converted to print() calls so the
    # function is valid Python 3 (single-argument calls print identically).
    evaluator_acc = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                      labelCol="indexedLabel",
                                                      metricName="accuracy")
    accuracy = evaluator_acc.evaluate(predictions)
    print("accuracy *******************")
    print(accuracy)
    evaluator_pre = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                      labelCol="indexedLabel",
                                                      metricName="weightedPrecision")
    print("precision *******************")
    print(evaluator_pre.evaluate(predictions))
    print("recall **********************")
    print(MulticlassClassificationEvaluator(predictionCol="prediction",
                                            labelCol="indexedLabel",
                                            metricName="weightedRecall").evaluate(predictions))
def evaluateGradientBoostTree(trainDF, testDF):
    """Train GBT classifiers over a sweep of stepSize values.

    :param trainDF: training DataFrame.
    :param testDF: test DataFrame.
    :return: list of benchmark tuples ("GBT", "stepsize", value, accuracy).
    """
    benchmarkData = []
    for stepsize in [0.1]:
        classifier = GBTClassifier(stepSize=stepsize)
        model = classifier.fit(trainDF)
        predictions = model.transform(testDF)
        print("Gradient Boost Tree with stepsize : {}".format(stepsize))
        accuracy = printevaluatation(model, predictions)
        # BUG FIX: the benchmark label was misspelled "sitepsize".
        benchmarkData += [("GBT", "stepsize", stepsize, float(accuracy))]
    return benchmarkData
def models():
    """Evaluate several classifiers across word/ngram and TF-IDF featurizations,
    printing each model's F1 score."""
    forest = RandomForestClassifier(labelCol="label", featuresCol="features")
    print("Random Forest F1 = %g" % evaluate(forest))
    svc = LinearSVC(maxIter=50)
    print("Linear SVC F1 = %g" % evaluate(svc))
    boosted = GBTClassifier()
    print("GBT F1 = %g" % evaluate(boosted))
    perceptron = MultilayerPerceptronClassifier(seed=1234, featuresCol='features')
    print("MLP F1 = %g" % evaluate(perceptron))
    factorization = FMClassifier()
    print('FM')
    evaluate(factorization)
    featurize_lda()
    # Plain-NGram evaluations are intentionally disabled here.
    # TF-IDF featurizations
    print("Ngram TF-IDF Random Forest F1 = %g" % evaluate(forest, "ngrams", "TF-IDF"))
    print("Ngram TF-IDF Linear SVC F1 = %g" % evaluate(svc, "ngrams", "TF-IDF"))
    print("Ngram TF-IDF GBT F1 = %g" % evaluate(boosted, "ngrams", "TF-IDF"))
    print("Words TF-IDF Random Forest F1 = %g" % evaluate(forest, "words", "TF-IDF"))
    print("Words TF-IDF Linear SVC F1 = %g" % evaluate(svc, "words", "TF-IDF"))
    print("Words TF-IDF GBT F1 = %g" % evaluate(boosted, "words", "TF-IDF"))
def myGBT(training, test, labelColumnName):
    """Grid-search a GBT classifier via module.modeling() and score *test*.

    :param training: training DataFrame (must contain `label` plus feature columns).
    :param test: raw test DataFrame; its feature vector is assembled here.
    :param labelColumnName: label column name, excluded from the feature list.
    :return: predictions DataFrame for the test set.
    """
    # Set up the default classifier.
    gbt = GBTClassifier(featuresCol="features", labelCol="label",
                        predictionCol="prediction", cacheNodeIds=True)
    # Build the hyper-parameter grid.
    paramGrid_gbt = ParamGridBuilder().addGrid(
        gbt.maxDepth, [5, 8, 10]).addGrid(gbt.minInfoGain, [0.0, 0.001]).addGrid(
        gbt.minInstancesPerNode, [1, 3]).addGrid(gbt.maxIter, [100, 150, 200]).addGrid(
        gbt.stepSize, [0.01, 0.1]).build()
    # Fit through the project's modeling helper; it returns the best model,
    # the best param map, and the chosen sampling fraction.
    bestModel_gbt, best_epm_gbt, best_sampling_gbt = module.modeling()._fit(
        training, gbt, paramGrid_gbt, [0.2, 0.5, 0.8], 3)
    # Predict: assemble the test features (all training columns except the label).
    all_list = training.columns
    all_list.remove(labelColumnName)
    assembler = VectorAssembler().setInputCols(all_list).setOutputCol(
        "features_vector")
    test = assembler.transform(test)
    predictions_gbt = bestModel_gbt.transform(test)
    # Confusion matrix: counts per (label, prediction) pair.
    predictions_gbt.groupBy('label', 'prediction').count().show()
    return predictions_gbt
def clf_gbt(feature, target):
    """Build a GBT classifier together with its hyper-parameter search grid.

    :param feature: features column name.
    :param target: label column name.
    :return: (estimator, param grid)
    """
    estimator = GBTClassifier(featuresCol=feature, labelCol=target, maxIter=10,
                              seed=_seed, cacheNodeIds=True)
    grid = (ParamGridBuilder()
            .addGrid(estimator.maxDepth, [10, 15, 20])
            .addGrid(estimator.stepSize, [.05, .1, .5])
            .build())
    return estimator, grid
def __pipeline(self, modeling_code: str, classifiers_metadata: dict,
               database_url_training: str, database_url_test: str) -> None:
    """Process the modeling code, then train each requested classifier on the
    thread pool and persist every classifier's results."""
    features_training, features_testing, features_evaluation = \
        self.__modeling_code_processing(
            modeling_code, self.__spark_session,
            database_url_training, database_url_test)
    # Two-letter classifier codes mapped to fresh estimator instances.
    available = {
        "LR": LogisticRegression(),
        "DT": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(),
        "GB": GBTClassifier(),
        "NB": NaiveBayes(),
    }
    # Submit one training task per requested classifier.
    pending = [
        self.__thread_pool.submit(
            self.__classifier_processing,
            available[name],
            features_training,
            features_testing,
            features_evaluation,
            metadata,
        )
        for name, metadata in classifiers_metadata.items()
    ]
    # Collect each result as it completes and persist it.
    for future in pending:
        testing_prediction, metadata_document = future.result()
        self.__save_classifier_result(testing_prediction, metadata_document)
def Distr_GBTClassifier(xy_train, xy_test):
    """Two-stage cross-validated tuning of a GBT classifier.

    Stage 1 picks subsamplingRate at maxIter=100; stage 2 re-tunes maxIter
    using the winning subsamplingRate. Returns the best fitted model.
    (xy_test is currently unused.)
    """
    gf = GBTClassifier(minInstancesPerNode=20, maxDepth=25)
    evalu = BinaryClassificationEvaluator()
    grid_1 = ParamGridBuilder()\
        .addGrid(gf.maxIter, [100])\
        .addGrid(gf.subsamplingRate, [0.5, 0.8, 1.0])\
        .build()
    cv_1 = CrossValidator(estimator=gf, estimatorParamMaps=grid_1,
                          evaluator=evalu, numFolds=5)
    # Search for the best parameter combination; fit() returns the best model.
    cvModel_1 = cv_1.fit(xy_train)
    # BUG FIX: Python 2 print statements converted to print() calls.
    print("Grid scores: ")
    best_params_1 = Get_best_params(cvModel_1)['subsamplingRate']
    grid = ParamGridBuilder()\
        .addGrid(gf.maxIter, [300, 500])\
        .addGrid(gf.subsamplingRate, [best_params_1, ])\
        .build()
    cv = CrossValidator(estimator=gf, estimatorParamMaps=grid,
                        evaluator=evalu, numFolds=5)
    # Second pass: tune maxIter with the winning subsamplingRate fixed.
    cvModel = cv.fit(xy_train)
    best_params = Get_best_params(cvModel)
    print("Best parameters set found: %s" % best_params)
    return cvModel.bestModel
def gbtc(self, maxIter=10):
    """Train and evaluate a gradient-boosted tree classifier, timing the run.

    :param maxIter: maximum number of boosting iterations.
    """
    self.time_calc.start_time('\nGradient-boosted tree classifier')
    estimator = GBTClassifier(labelCol=self.label_col,
                              featuresCol=self.features_col,
                              maxIter=maxIter)
    self.classify('gbtc', estimator, True)
    self.time_calc.end_time('Gradient-boosted tree classifier')
def _spark_rf(self):
    """Train a GBT classifier on self.data and print the test error.

    (Keeps its historical `_spark_rf` name although it now fits a GBT, per the
    commented-out RandomForest line it replaced.)  Expects self.data to be
    convertible to a Spark DataFrame with a `pred` label column; every other
    column is used as a feature.
    """
    self.df = spark.createDataFrame(self.data)
    # All columns except the label become features.
    features = [col for col in self.df.columns if col != 'pred']
    (trainingData, testData) = self.df.randomSplit([0.7, 0.3], seed=24234232)
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    # BUG FIX: removed the duplicated `gbt = gbt = ...` assignment.
    gbt = GBTClassifier(labelCol="pred", featuresCol="features", maxIter=200)
    pipeline = Pipeline(stages=[assembler, gbt])
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)
    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="pred", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
def gbdt_core(df, condition): """ gdbt二分类核心函数 :param spark_session: :param df: :param condition:{"label": "标签", "features": ["数量", "折扣", "利润", "装运成本"], "iterations": 20, "step": 0.1, "maxDepth": 5, "minInstancesPerNode": 1, "seed": 1} :return: """ # 参数 label_index = condition['label'] # 标签列(列名或列号) feature_indexs = condition['features'] # 特征列(列名或列号) iterations = condition['iterations'] # 迭代次数 step = condition['step'] # 学习速率(0-1) max_depth = condition['maxDepth'] # 数的最大深度[1,100] minInstancesPerNode = condition['minInstancesPerNode'] # 叶子节点最少样本数[1,1000] seed = condition['seed'] # 随机数产生器种子[0,10] # 1. 准备数据 def func(x): features_data = [] for feature in feature_indexs: features_data.append(x[feature]) return Row(label=x[label_index], features=Vectors.dense(features_data)) training_set = df.rdd.map(lambda x: func(x)).toDF() string_indexer = StringIndexer(inputCol="label", outputCol="indexed") si_model = string_indexer.fit(training_set) tf = si_model.transform(training_set) # 2. 训练 gbdt = GBTClassifier(labelCol="indexed", maxIter=iterations, stepSize=step, maxDepth=max_depth, minInstancesPerNode=minInstancesPerNode, seed=seed) gbdt_model = gbdt.fit(tf) print(gbdt_model.featureImportances) # 3.保存模型 svm_model_path = model_url() + '/gbdt/' + str(uuid.uuid1()) deltree(svm_model_path) # 删除已经存在的模型 gbdt_model.write().overwrite().save(svm_model_path) return svm_model_path
def _get_xgboost_classifier_model(col, train):
    """Fit a gradient-boosted tree classifier used to impute missing values.

    :param col: label column the model predicts.
    :param train: training DataFrame.
    :return: fitted GBT classification model.
    """
    print(
        'Using Gradient Boosted Regressor Module to predict Missing Values ...'
    )
    # A cross-validated grid search was sketched here previously but is
    # intentionally disabled; the plain estimator is fitted directly.
    return GBTClassifier(labelCol=col).fit(train)
def model_dev_gbm(df_train, df_test, max_depth, max_bins, max_iter): gbm_start_time = time() # Create an Initial Model Instance mod_gbm= GBTClassifier(labelCol='label', featuresCol='features', maxDepth=max_depth, maxBins=max_bins, maxIter=max_iter) # Training The Model gbm_final_model = mod_gbm.fit(df_train) # Scoring The Model On Test Sample gbm_transformed = gbm_final_model.transform(df_test) gbm_test_results = gbm_transformed.select(['prediction', 'label']) gbm_predictionAndLabels= gbm_test_results.rdd gbm_test_metrics = MulticlassMetrics(gbm_predictionAndLabels) # Collecting The Model Statistics gbm_cm=gbm_test_metrics.confusionMatrix().toArray() gbm_accuracy=round(float((gbm_cm[0][0]+gbm_cm[1][1])/gbm_cm.sum())*100,2) gbm_precision=round(float((gbm_cm[0][0])/(gbm_cm[0][0]+gbm_cm[1][0]))*100,2) gbm_recall=round(float((gbm_cm[0][0])/(gbm_cm[0][0]+gbm_cm[0][1]))*100,2) gbm_auc = round(float(BinaryClassificationMetrics(gbm_predictionAndLabels).areaUnderROC)*100,2) # Printing The Model Statitics print("\n++++++ Printing GBM Model Accuracy ++++++\n") print("Accuracy: "+str(gbm_accuracy)+"%") print("AUC: "+str(gbm_auc)+"%") print("Precision: "+str(gbm_precision)+"%") print("Recall: "+str(gbm_recall)+"%") gbm_end_time = time() gbm_elapsed_time = (gbm_end_time - gbm_start_time)/60 gbm_model_stat = pd.DataFrame({"Model Name" : ["Gradient Boosting Machine"], "Accuracy" : gbm_accuracy, "AUC": gbm_auc, "Precision": gbm_precision, "Recall": gbm_recall, "Time (Min.)": round(gbm_elapsed_time,3)}) gbm_output = (gbm_final_model,gbm_model_stat,gbm_cm) print("Time To Build GBM Model: %.3f Minutes" % gbm_elapsed_time) return(gbm_output)
def pipeline(self, modeling_code, classifiers_metadata):
    """Create a Spark session, process the modeling code, and train every
    requested classifier concurrently, saving each classifier's result.

    :param modeling_code: user-supplied modeling code producing the feature sets.
    :param classifiers_metadata: mapping of classifier code ("LR", "DT", "RF",
        "GB", "NB") to its metadata document.
    """
    # Session wired for the model-builder service: host/ports come from
    # environment variables; the FAIR scheduler pool isolates this job.
    spark_session = (
        SparkSession
        .builder
        .appName("modelBuilder")
        .config("spark.driver.port", os.environ[SPARK_DRIVER_PORT])
        .config("spark.driver.host", os.environ[MODEL_BUILDER_HOST_NAME])
        .config("spark.jars.packages",
                "org.mongodb.spark:mongo-spark-connector_2.11:2.4.2",
                )
        .config("spark.scheduler.mode", "FAIR")
        .config("spark.scheduler.pool", "modelBuilder")
        .config("spark.scheduler.allocation.file", "./fairscheduler.xml")
        .master("spark://"
                + os.environ[SPARKMASTER_HOST]
                + ":"
                + str(os.environ[SPARKMASTER_PORT])
                )
        .getOrCreate()
    )
    (features_training, features_testing, features_evaluation) = \
        self.modeling_code_processing(
            modeling_code, spark_session)
    # Two-letter classifier codes mapped to estimator instances.
    classifier_switcher = {
        "LR": LogisticRegression(),
        "DT": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(),
        "GB": GBTClassifier(),
        "NB": NaiveBayes(),
    }
    classifier_threads = []
    # Submit one training task per requested classifier.
    for name, metadata in classifiers_metadata.items():
        classifier = classifier_switcher[name]
        classifier_threads.append(
            self.thread_pool.submit(
                Model.classifier_processing,
                classifier,
                features_training,
                features_testing,
                features_evaluation,
                metadata,
            )
        )
    # Wait for each training to finish and persist its results.
    for classifier in classifier_threads:
        testing_prediction, metadata_document = classifier.result()
        self.save_classifier_result(
            testing_prediction,
            metadata_document
        )
    spark_session.stop()
def estimators(config):
    """Return the unfitted Spark ML estimator selected by *config*.

    All models to choose amongst for simple regression/classification:
    config['base']['model'] picks the family ('rf', 'gbm', 'logistic',
    'linear'); for 'rf'/'gbm', config['base']['model_type'] selects
    classification vs regression. Hyper-parameters come from config['model'].

    NOTE(review): if `model` matches no branch (or `model_type` neither case),
    `glm` is never bound and the final return raises UnboundLocalError —
    confirm upstream validation guarantees valid values.
    """
    model_type = config['base']['model_type']
    model = config['base']['model']
    if model == 'rf':
        if model_type == 'classification':
            glm = RandomForestClassifier(
                featuresCol = config['base']['featuresCol'],
                labelCol = config['base']['labelCol'],
                predictionCol = config['base']['predictionCol'],
                numTrees = config['model']['numTrees'],
                maxDepth = config['model']['maxDepth']
            )
        elif model_type == 'regression':
            glm = RandomForestRegressor(
                featuresCol = config['base']['featuresCol'],
                labelCol = config['base']['labelCol'],
                predictionCol = config['base']['predictionCol'],
                numTrees = config['model']['numTrees'],
                maxDepth = config['model']['maxDepth']
            )
    if model == 'gbm':
        if model_type == 'classification':
            glm = GBTClassifier(
                featuresCol = config['base']['featuresCol'],
                labelCol = config['base']['labelCol'],
                predictionCol = config['base']['predictionCol'],
                lossType = config['model']['lossType'],
                maxDepth = config['model']['maxDepth'],
                stepSize = config['model']['stepSize']
            )
        elif model_type == 'regression':
            glm = GBTRegressor(
                featuresCol = config['base']['featuresCol'],
                labelCol = config['base']['labelCol'],
                predictionCol = config['base']['predictionCol'],
                lossType = config['model']['lossType'],
                maxDepth = config['model']['maxDepth'],
                stepSize = config['model']['stepSize']
            )
    if model == 'logistic':
        glm = LogisticRegression(
            featuresCol = config['base']['featuresCol'],
            labelCol = config['base']['labelCol'],
            predictionCol = config['base']['predictionCol'],
            threshold = config['model']['threshold'],
            regParam = config['model']['regParam'],
            elasticNetParam = config['model']['elasticNetParam']
        )
    if model == 'linear':
        glm = LinearRegression(
            featuresCol = config['base']['featuresCol'],
            labelCol = config['base']['labelCol'],
            predictionCol = config['base']['predictionCol'],
            regParam = config['model']['regParam'],
            elasticNetParam = config['model']['elasticNetParam']
        )
    return glm
def predictions(train, test):
    """Fit a GBT classifier on the Income label and print accuracy, precision,
    recall, and F1 derived from the confusion matrix."""
    # Apply the GBT technique.
    classifier = GBTClassifier(featuresCol="Atributos", labelCol="Income", maxBins=41)
    model = classifier.fit(train)
    scored = model.transform(test)
    labelled = scored.select("Income", "prediction").rdd
    matrix = MulticlassMetrics(labelled).confusionMatrix().toArray()
    # Derive the metrics from the 2x2 confusion matrix (cell [0][0] = positives).
    accuracy = (matrix[0][0] + matrix[1][1]) / matrix.sum()
    precision = matrix[0][0] / (matrix[0][0] + matrix[1][0])
    recall = matrix[0][0] / (matrix[0][0] + matrix[0][1])
    f1 = 2 * ((precision * recall) / (precision + recall))
    print("Metricas del modelo GBT Classifier")
    print("accuracy = {0}, precision = {1}, recall = {2}, f1 = {3}".format(
        accuracy, precision, recall, f1))
    return
# NOTE(review): this chunk begins mid-function — the enclosing `def` of the
# row-parsing helper below is not visible here, so its indentation is
# reconstructed; confirm against the full file.
    # Hash-trick featurization: one bucketed feature per column of the row.
    features = []
    try:
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_" + '{0:04}'.format(i) + fe[i]))) % D)
        target = float(r[-1])
        ID = float(r[0])
        return target, Vectors.dense(features)
    except:
        # Malformed row: fall back to an all-zero 1932-wide feature vector.
        return (0.0, [0.0] * 1932)

# Keep only rows with the expected 1934 fields, then featurize.
new_rdd = rdd.filter(lambda i: len(i) == 1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans, ["label", "features"])
# NOTE(review): this splits `rdd_pca1`, not the `df` built above — looks like
# a stale variable name; confirm which dataset should be split.
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(trainingData)
td = si_model.transform(trainingData)
# NOTE(review): labelCol is "label", so the indexed DataFrame `td` computed
# above is never used — confirm whether training was meant to use it.
gbt = GBTClassifier(maxIter=100, maxDepth=10, labelCol="label")
model = gbt.fit(trainingData)
# Emit "label,probability-of-class-0" lines for each test row.
result = model.transform(testData).rdd.map(lambda r: str(r.label) + ',' + str(r.probability[0]))
result.saveAsTextFile("/user/demo/gbt_100_20")
# Tiny hand-built DataFrame (appears to be scratch/demo data).
df1 = sqlContext.createDataFrame([
    Row(label=0.0, features=Vectors.dense([0.0, 0.0])),
    Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
    Row(label=1.0, features=Vectors.dense([1.0, 0.0]))])