Beispiel #1
0
def basic_example(spark, resources_folder):
    """Compare decision-tree, random-forest and GBT classifiers on the
    sample libsvm data set and print each model's accuracy."""
    data = spark.read.format('libsvm').load(resources_folder +
                                            'sample_libsvm_data.txt')
    data.printSchema()
    data.show()
    train_data, test_data = data.randomSplit([0.6, 0.4])

    # Fit one model per tree-based classifier family.
    tree_model = DecisionTreeClassifier().fit(train_data)
    forest_model = RandomForestClassifier().fit(train_data)
    boosted_model = GBTClassifier().fit(train_data)

    tree_preds = tree_model.transform(test_data)
    forest_preds = forest_model.transform(test_data)
    boosted_preds = boosted_model.transform(test_data)

    tree_preds.show()
    forest_preds.show()
    # NOTE: GBT output has no rawPrediction column; evaluators for binary or
    # multiclass problems that require rawPrediction as input will not accept it.
    boosted_preds.show()

    acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
    for title, preds in (("DTC Accuracy", tree_preds),
                         ("RFC Accuracy", forest_preds),
                         ("GBTC Accuracy", boosted_preds)):
        print(title)
        print(acc_eval.evaluate(preds))

    # Feature importances of the random-forest model only, as in the original.
    print(forest_model.featureImportances)
def gradient_boosted_tree_classifier(training_data, test_data, validation_data):
    """Train a GBT classifier, evaluate it on the validation split, tune it
    with 5-fold cross validation, then score the best model on the test split.
    """
    # Baseline model (observed ROC around 0.71).
    estimator = GBTClassifier(featuresCol='scaled_features', labelCol='label', maxIter=10)
    fitted = estimator.fit(training_data)

    validation_scored = fitted.transform(validation_data)
    validation_scored.show(5)

    evaluate_metrics(validation_scored)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='label',
                                              metricName="areaUnderROC")

    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=validation_scored,
                    data_type="valid_data")

    # Hyper-parameter grid explored by the cross validator.
    grid = (ParamGridBuilder()
            .addGrid(estimator.maxDepth, [2, 4, 6])
            .addGrid(estimator.maxBins, [20, 60])
            .addGrid(estimator.maxIter, [10, 20])
            .build())

    cross_validator = CrossValidator(estimator=estimator, estimatorParamMaps=grid,
                                     evaluator=evaluator, numFolds=5)
    cv_model = cross_validator.fit(training_data)
    cv_validation_scored = cv_model.transform(validation_data)

    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=cv_validation_scored,
                    data_type="valid_data")

    # Final evaluation of the best cross-validated model on unseen test data.
    test_scored = cv_model.bestModel.transform(test_data)

    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=test_scored,
                    data_type="test_data")
Beispiel #3
0
    def gbt(df, columns, input_col, **kwargs):
        """
        Run a gradient-boosted tree classifier over the given DataFrame.

        :param df: PySpark DataFrame to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :param kwargs: Extra keyword arguments forwarded to GBTClassifier.
        :return: Tuple of (DataFrame with predictions, fitted GBT model).
        """
        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        # Everything selected except the target column becomes a feature.
        feature_cols = df.select(columns).columns
        feature_cols.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feature_cols, output_col="features")

        classifier = GBTClassifier(**kwargs)

        # Rename the indexed target column to the 'label' name Spark ML expects.
        df = df.cols.rename(name_col(input_col, "index_to_string"), "label")

        fitted = classifier.fit(df)
        return fitted.transform(df), fitted
Beispiel #4
0
def gbtClassifier(train, test):
    """Fit a 10-iteration GBT model on ``train``, score ``test``, show a
    sample of the scored rows and return the predictions DataFrame."""
    model = GBTClassifier(maxIter=10).fit(train)
    scored = model.transform(test)
    scored.select('age', 'job', 'label', 'rawPrediction', 'prediction',
                  'probability').show(10)
    return scored
Beispiel #5
0
def gbt(df, columns, input_col):
    """
    Run a gradient-boosted tree classifier for the input DataFrame.

    :param df: PySpark DataFrame to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: Tuple of (DataFrame with predictions, fitted GBT model).
    """
    assert_spark_df(df)
    assert isinstance(columns, list), "Error, columns must be a list"
    assert isinstance(input_col, str), "Error, input column must be a string"

    # All selected columns except the target become features.
    selected = df.select(columns)
    feature_cols = selected.columns
    feature_cols.remove(input_col)

    transformer = op.DataFrameTransformer(selected)
    transformer.string_to_index(input_cols=input_col)
    transformer.vector_assembler(input_cols=feature_cols)
    transformer.rename_col(columns=[(input_col + "_index", "label")])

    classifier = GBTClassifier()
    fitted = classifier.fit(transformer.df)
    return fitted.transform(transformer.df), fitted
Beispiel #6
0
def gbdtClassification(df, arguments):
    """Fit a gradient-boosted tree classifier on ``df``.

    :param df: training DataFrame with label/features columns.
    :param arguments: namespace with optional ``maxDepth``,
        ``minInstancesPerNode``, ``numTrees`` and ``stepSize`` overrides
        (string or numeric values, e.g. from argparse).
    :return: the fitted GBT classification model.
    """
    from pyspark.ml.classification import GBTClassifier

    # Defaults, overridden below by any CLI-supplied values.
    numTrees = 20
    stepSize = 0.1
    maxDepth = 5
    minInstancesPerNode = 1

    # BUG FIX: maxDepth, minInstancesPerNode and maxIter are integer Spark
    # params -- casting them with float() makes pyspark's int type converter
    # reject non-integral values and passes floats where ints are expected.
    # int(float(...)) still accepts numeric strings like "5.0".
    # Also: compare against None with ``is not None``.
    if arguments.maxDepth is not None:
        maxDepth = int(float(arguments.maxDepth))

    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = int(float(arguments.minInstancesPerNode))

    if arguments.numTrees is not None:
        numTrees = int(float(arguments.numTrees))

    if arguments.stepSize is not None:
        stepSize = float(arguments.stepSize)

    gbdt = GBTClassifier(maxIter=numTrees,
                         stepSize=stepSize,
                         maxDepth=maxDepth,
                         minInstancesPerNode=minInstancesPerNode)
    return gbdt.fit(df)
Beispiel #7
0
    def exec_gradient_boost(self,
                            featuresCol1="features",
                            labelCol1="label",
                            predictionCol1="prediction",
                            maxIter1=30,
                            numClass1=2):
        '''
        Build and evaluate a Gradient Boost model; this model applies only to
        binary classification problems.

        Input: featuresCol1: feature column name, labelCol1: label column
               name, predictionCol1: prediction column name, maxIter1: maximum
               number of boosting iterations, numClass1: number of class
               labels (restricted to 2).
        Output: None
        '''
        # Configure the estimator from the supplied column names/parameters.
        booster = GBTClassifier(featuresCol=featuresCol1,
                                labelCol=labelCol1,
                                predictionCol=predictionCol1,
                                maxIter=maxIter1)

        # Fit on the training split held by this object.
        fitted = booster.fit(self.trainingData)

        # Score the held-out test split.
        scored = fitted.transform(self.testData)

        # Delegate metric computation/reporting to the shared evaluator.
        self.model_evaluator(scored,
                             modelType="GradientBoost Model",
                             modelParams=str({'maxIter': maxIter1}),
                             numClass=numClass1)
Beispiel #8
0
def evaluateGradientBoostTree(trainDF, testDF):
    """Sweep GBT step sizes and print the evaluation of each fitted model."""
    for rate in (0.01, 0.1, 1):
        fitted = GBTClassifier(stepSize=rate).fit(trainDF)
        scored = fitted.transform(testDF)
        print("Gradient Boost Tree with stepsize : {}".format(rate))
        printevaluatation(fitted, scored)
 def test_gbt_classifier(self):
     """Round-trip a small Sparkml GBT classifier through ONNX conversion and
     compare the ONNX runtime output against Spark's own predictions."""
     # Two-row toy dataset: one dense positive, one all-zero sparse negative.
     raw_data = self.spark.createDataFrame([
         (1.0, Vectors.dense(1.0)),
         (0.0, Vectors.sparse(1, [], []))
     ], ["label", "features"])
     string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
     si_model = string_indexer.fit(raw_data)
     data = si_model.transform(raw_data)
     # Fixed seed keeps the fitted trees (and thus this test) deterministic.
     gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(model, 'Sparkml GBT Classifier', [
         ('features', FloatTensorType([1, feature_count]))
     ], spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     # Flatten the Spark vector column into a float32 numpy matrix.
     data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     ]
     # NOTE(review): index 3 of the returned paths is treated as the saved
     # ONNX model file -- confirm against the save_data_models helper.
     paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlGBTClassifier")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['prediction', 'probability'], data_np, onnx_model_path)
     compare_results(expected, output, decimal=5)
def run_gradient_boost(tn_data, ts_data):
    """Fit a GBT classifier on ``tn_data`` and report metrics on ``ts_data``."""
    booster = GBTClassifier(featuresCol="scaled_features",
                            labelCol="output",
                            predictionCol="prediction")
    scored = booster.fit(tn_data).transform(ts_data)

    print_perf_eval(scored)
Beispiel #11
0
def transform_predictions(dataframe, spark):
    """Train four classifiers on COVID exam data and return their accuracies.

    :param dataframe: raw input DataFrame; hospital-admission columns are dropped.
    :param spark: active SparkSession, used to build the result DataFrame.
    :return: single-column float DataFrame of accuracies, in the order
        RandomForest, DecisionTree, LogisticRegression, GBT.
    """
    df_transformed = dataframe.drop("Patient addmited to regular ward (1=yes, 0=no)",
                                    "Patient addmited to semi-intensive unit (1=yes, 0=no)",
                                    "Patient addmited to intensive care unit (1=yes, 0=no)")

    df_transformed_no_missing = dismiss_missing_values(df_transformed)

    # build the dataset to be used as a model base
    outcome_features = ["SARS-Cov-2 exam result"]
    required_features = ['Hemoglobin', 'Hematocrit', 'Platelets', 'Eosinophils', 'Red blood Cells', 'Lymphocytes',
                         'Leukocytes', 'Basophils', 'Monocytes']

    assembler = VectorAssembler(inputCols=required_features, outputCol='features')
    model_data = assembler.transform(df_transformed_no_missing)

    # split the dataset into train/test subgroups
    (training_data, test_data) = model_data.randomSplit([0.8, 0.2], seed=2020)

    # DRY FIX: one shared accuracy evaluator instead of four identical copies.
    evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')

    def _fit_and_score(estimator):
        # Fit on the train split, score the test split, return (predictions, accuracy).
        predictions = estimator.fit(training_data).transform(test_data)
        return predictions, evaluator.evaluate(predictions)

    # Random Forest classifier
    rf = RandomForestClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features', maxDepth=5)
    rf_predictions, rf_accuracy = _fit_and_score(rf)

    # Decision Tree Classifier
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxDepth=3)
    dt_predictions, dt_accuracy = _fit_and_score(dt)
    dt_predictions.select(outcome_features + required_features).show(10)

    # Logistic Regression Model
    lr = LogisticRegression(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxIter=10)
    lr_predictions, lr_accuracy = _fit_and_score(lr)

    # Gradient-boosted Tree classifier Model
    gb = GBTClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features')
    gb_predictions, gb_accuracy = _fit_and_score(gb)

    rdd = spark.sparkContext.parallelize([rf_accuracy, dt_accuracy, lr_accuracy, gb_accuracy])
    return spark.createDataFrame(rdd, FloatType())
Beispiel #12
0
def universities_example(spark, resources_folder):
    """Compare DTC/RFC/GBTC on the College data set, predicting the indexed
    'Private' label, and print AUC per model plus the RFC accuracy."""
    data = spark.read.csv(resources_folder + 'College.csv',
                          header=True,
                          inferSchema=True)
    data.printSchema()
    data.show()

    # BUG FIX: a missing comma after 'Outstate' concatenated it with
    # 'Room_Board' into the single (nonexistent) column name
    # 'OutstateRoom_Board', silently dropping both features.
    assembler = VectorAssembler(inputCols=[
        'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
        'P_Undergrad', 'Outstate',
        'Room_Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S_F_Ratio',
        'perc_alumni', 'Expend', 'Grad_Rate'
    ],
                                outputCol='features')
    data_assembled = assembler.transform(data)
    private_state_indexer = StringIndexer(inputCol='Private',
                                          outputCol='PrivateIndex')
    data_transformed = private_state_indexer.fit(data_assembled).transform(
        data_assembled)

    train_data, test_data = data_transformed.select(
        ['features', 'PrivateIndex']).randomSplit([0.6, 0.4])

    dtc = DecisionTreeClassifier(labelCol='PrivateIndex',
                                 featuresCol='features')
    rfc = RandomForestClassifier(labelCol='PrivateIndex',
                                 featuresCol='features')
    gbtc = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

    dtc_college_model = dtc.fit(train_data)
    rfc_college_model = rfc.fit(train_data)
    gbtc_college_model = gbtc.fit(train_data)

    dtc_predictions = dtc_college_model.transform(test_data)
    rfc_predictions = rfc_college_model.transform(test_data)
    gbtc_predictions = gbtc_college_model.transform(test_data)

    my_binary_evaluator = BinaryClassificationEvaluator(
        labelCol='PrivateIndex')
    print("DTC Evaluator")
    print(my_binary_evaluator.evaluate(dtc_predictions))
    print("RFC Evaluator")
    print(my_binary_evaluator.evaluate(rfc_predictions))
    # BUG FIX: this section evaluates the GBT model but was labelled "DTC".
    print("GBTC Evaluator")
    # The evaluator is rebuilt pointing rawPredictionCol at 'prediction' for
    # the GBT output, as in the original code.
    my_binary_evaluator = BinaryClassificationEvaluator(
        labelCol='PrivateIndex', rawPredictionCol='prediction')
    print(my_binary_evaluator.evaluate(gbtc_predictions))

    # Accuracy requires a MulticlassClassificationEvaluator; the binary
    # evaluator only reports ROC/PR areas.
    acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',
                                                 metricName='accuracy')
    rfc_accuracy = acc_eval.evaluate(rfc_predictions)
    print(rfc_accuracy)
def testClassification(train, test):
    """Train a GradientBoostedTrees model and print its AUC on ``test``.

    :param train: training DataFrame with 'indexedLabel' and feature columns.
    :param test: test DataFrame with the same schema.
    """
    rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = rf.fit(train)
    # BUG FIX: DataFrame.map was removed in Spark 2.0 -- go through .rdd to
    # map the scored rows to (prediction, label) pairs.
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = BinaryClassificationMetrics(predictionAndLabels)
    print("AUC %.3f" % metrics.areaUnderROC)
def testClassification(train, test):
    """Train a GradientBoostedTrees model and print its AUC on ``test``.

    :param train: training DataFrame with 'indexedLabel' and feature columns.
    :param test: test DataFrame with the same schema.
    """
    rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = rf.fit(train)
    # BUG FIX: DataFrame.map was removed in Spark 2.0 -- go through .rdd to
    # map the scored rows to (prediction, label) pairs.
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = BinaryClassificationMetrics(predictionAndLabels)
    print("AUC %.3f" % metrics.areaUnderROC)
def gradientBoosting(df,
                     feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                     maxIter=20,
                     stepSize=0.1):
    """Train and evaluate a GBT classifier on ``df``, printing the label
    distribution, AUC and per-class miss rates.

    :param df: DataFrame with a numeric 'label' column (0 = disk, 1 = cloud)
        and the columns named in ``feature_list``.
    :param feature_list: columns assembled into the feature vector.
    :param maxIter: NOTE(review): accepted but not used -- the classifier
        below is built with maxIter=10 as in the original code; confirm intent.
    :param stepSize: NOTE(review): accepted but not used; confirm intent.
    :return: (auc, summary string, fitted model).
    """
    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)

    # FIX: dropped the no-op .withColumnRenamed('label', 'label').
    df = df_temp.select(['label', 'features'])

    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
    model = gbt.fit(trainingData)

    predictions = model.transform(testData)
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    auc = evaluator.evaluate(predictions)

    # Label distribution over the full data set.
    total = df.select('label').count()
    disk = df.filter(df.label == 0).count()
    cloud = df.filter(df.label == 1).count()

    print('Gradient-Boosted Tree')
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(feature_list)

    print(" Test AUC = {}\n".format(auc * 100))

    # NOTE(review): raises ZeroDivisionError if either class is absent from
    # df -- confirm whether that can happen upstream.
    misses = predictions.filter(predictions.label != predictions.prediction)
    disk_misses = misses.filter(misses.label == 0).count()
    cloud_misses = misses.filter(misses.label == 1).count()

    print(' Cloud Misses %{}'.format((cloud_misses / cloud) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk) * 100))

    return auc, 'Gradient Boosted: {}'.format(auc), model
Beispiel #16
0
def gbt_classifier(training, testing):
    """Train a default GBT model (maxIter=10) and return its test accuracy."""
    from pyspark.ml.classification import GBTClassifier

    # Fit the boosted-tree model on the training split.
    booster = GBTClassifier(maxIter=10)
    fitted = booster.fit(training)

    # Score the held-out split and measure accuracy.
    scored = fitted.transform(testing)
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    return evaluator.evaluate(scored)
def main():
    """Split a parquet data set 30/70, train a GBT classifier on the larger
    split and print accuracy, weighted precision and weighted recall.

    Usage: script.py <parquet_path>
    """
    spark = SparkSession \
        .builder \
        .appName("RandomForest") \
        .config("spark.executor.heartbeatInterval", "60s") \
        .getOrCreate()

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    sc.setLogLevel("INFO")

    # Loading the test data
    df_test = spark.read.parquet(sys.argv[1])

    df_test, df_train = df_test.randomSplit([0.3, 0.7])
    df_train_indexed = df_train.selectExpr("label as indexedLabel", "features as indexedFeatures")
    df_test_indexed = df_test.selectExpr("label as indexedLabel", "features as indexedFeatures")

    # NOTE(review): maxBins=24000000 is extremely large -- confirm it is
    # intentional (it drives very high memory use for split finding).
    gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=100, maxBins=24000000)
    model = gbt.fit(df_train_indexed)
    predictions = model.transform(df_test_indexed)

    evaluator_acc = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="indexedLabel",
                                                      metricName="accuracy")
    accuracy = evaluator_acc.evaluate(predictions)

    # BUG FIX: converted Python 2 print statements to print() calls for
    # consistency with the rest of the file (Python 3 syntax).
    print("accuracy *******************")
    print(accuracy)

    evaluator_pre = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="indexedLabel",
                                                      metricName="weightedPrecision")

    print("precision *******************")
    print(evaluator_pre.evaluate(predictions))

    print("recall **********************")
    print(MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="indexedLabel",
                                            metricName="weightedRecall").evaluate(predictions))
def evaluateGradientBoostTree(trainDF, testDF):
    """Train GBT classifiers over a sweep of stepSize values.

    :param trainDF: training DataFrame.
    :param testDF: test DataFrame.
    :return: benchmark list of ("GBT", "stepsize", value, accuracy) tuples.
    """
    benchmarkData = []
    for stepsize in [0.1]:
        classifier = GBTClassifier(stepSize=stepsize)
        model = classifier.fit(trainDF)
        predictions = model.transform(testDF)
        print("Gradient Boost Tree with stepsize : {}".format(stepsize))
        accuracy = printevaluatation(model, predictions)
        # BUG FIX: the parameter label was misspelled "sitepsize".
        benchmarkData += [("GBT", "stepsize", stepsize, float(accuracy))]
    return benchmarkData
Beispiel #19
0
def gbdt_core(df, condition):
    """
    Core routine for binary GBDT classification.

    :param df: input DataFrame.
    :param condition: dict with keys "label", "features", "iterations",
        "step", "maxDepth", "minInstancesPerNode" and "seed".
    :return: filesystem path where the fitted model was saved.
    """
    # Unpack the configuration.
    label_index = condition['label']                        # label column (name or index)
    feature_indexs = condition['features']                  # feature columns (names or indexes)
    iterations = condition['iterations']                    # number of boosting iterations
    step = condition['step']                                # learning rate in (0, 1)
    max_depth = condition['maxDepth']                       # maximum tree depth [1, 100]
    minInstancesPerNode = condition['minInstancesPerNode']  # minimum samples per leaf [1, 1000]
    seed = condition['seed']                                # RNG seed [0, 10]

    # 1. Build (label, features) rows from the configured columns.
    def to_row(record):
        values = [record[feature] for feature in feature_indexs]
        return Row(label=record[label_index], features=Vectors.dense(values))

    training_set = df.rdd.map(lambda record: to_row(record)).toDF()

    indexer = StringIndexer(inputCol="label", outputCol="indexed")
    indexed_set = indexer.fit(training_set).transform(training_set)

    # 2. Train the boosted-tree model on the indexed label.
    gbdt = GBTClassifier(labelCol="indexed",
                         maxIter=iterations,
                         stepSize=step,
                         maxDepth=max_depth,
                         minInstancesPerNode=minInstancesPerNode,
                         seed=seed)
    gbdt_model = gbdt.fit(indexed_set)
    print(gbdt_model.featureImportances)

    # 3. Persist the model under a fresh UUID directory.
    svm_model_path = model_url() + '/gbdt/' + str(uuid.uuid1())
    deltree(svm_model_path)  # remove any pre-existing model at that path
    gbdt_model.write().overwrite().save(svm_model_path)

    return svm_model_path
def _get_xgboost_classifier_model(col, train):
    '''
    Fit a Gradient Boosted Tree classifier used to predict missing values.

    :param col: name of the label column to predict.
    :param train: training DataFrame.
    :return: the fitted GBT classification model.
    '''
    print(
        'Using Gradient Boosted Regressor Module to predict Missing Values ...'
    )
    # NOTE: a ParamGridBuilder/CrossValidator tuning setup existed here but
    # was disabled; the plain estimator is fitted directly.
    estimator = GBTClassifier(labelCol=col)
    return estimator.fit(train)
Beispiel #21
0
def model_dev_gbm(df_train, df_test, max_depth, max_bins, max_iter):
    """Train a gradient-boosting-machine classifier and collect its stats.

    :return: tuple of (fitted model, one-row pandas stats frame,
        confusion matrix as a numpy array).
    """
    start = time()

    # Configure the estimator from the supplied hyper-parameters.
    estimator = GBTClassifier(labelCol='label',
                              featuresCol='features',
                              maxDepth=max_depth,
                              maxBins=max_bins,
                              maxIter=max_iter)

    # Fit on the training sample.
    fitted = estimator.fit(df_train)

    # Score the test sample; wrap (prediction, label) pairs for the metrics APIs.
    scored = fitted.transform(df_test)
    pairs_rdd = scored.select(['prediction', 'label']).rdd
    test_metrics = MulticlassMetrics(pairs_rdd)

    # Accuracy/precision/recall from the 2x2 confusion matrix; AUC from the
    # binary-classification metrics helper.
    cm = test_metrics.confusionMatrix().toArray()
    accuracy = round(float((cm[0][0] + cm[1][1]) / cm.sum()) * 100, 2)
    precision = round(float((cm[0][0]) / (cm[0][0] + cm[1][0])) * 100, 2)
    recall = round(float((cm[0][0]) / (cm[0][0] + cm[0][1])) * 100, 2)
    auc = round(float(BinaryClassificationMetrics(pairs_rdd).areaUnderROC) * 100, 2)

    print("\n++++++ Printing GBM Model Accuracy ++++++\n")
    print("Accuracy: " + str(accuracy) + "%")
    print("AUC: " + str(auc) + "%")
    print("Precision: " + str(precision) + "%")
    print("Recall: " + str(recall) + "%")

    elapsed = (time() - start) / 60
    stats = pd.DataFrame({"Model Name": ["Gradient Boosting Machine"],
                          "Accuracy": accuracy,
                          "AUC": auc,
                          "Precision": precision,
                          "Recall": recall,
                          "Time (Min.)": round(elapsed, 3)})
    print("Time To Build GBM Model: %.3f Minutes" % elapsed)

    return (fitted, stats, cm)
Beispiel #22
0
def predictions(train, test):
    """Fit a GBT classifier on ``train`` and print confusion-matrix metrics."""
    # Gradient-boosted trees over the pre-assembled 'Atributos' feature column.
    booster = GBTClassifier(featuresCol="Atributos", labelCol="Income", maxBins=41)
    fitted = booster.fit(train)
    scored = fitted.transform(test)
    pairs = scored.select("Income", "prediction").rdd
    cm = MulticlassMetrics(pairs).confusionMatrix().toArray()

    # Metrics derived from the 2x2 confusion matrix.
    accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
    precision = cm[0][0] / (cm[0][0] + cm[1][0])
    recall = cm[0][0] / (cm[0][0] + cm[0][1])
    f1 = 2 * ((precision * recall) / (precision + recall))

    print("Metricas del modelo GBT Classifier")
    print("accuracy = {0}, precision = {1}, recall = {2}, f1 = {3}".format(
        accuracy, precision, recall, f1))
    return
Beispiel #23
0
def embedding(DF, inputCol="stemmed"):
    """Embed tokens with Word2Vec, fit a GBT classifier on an 80/20 split and
    return the classifier plus its ROC-AUC and accuracy."""
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

    # Turn the token column into dense feature vectors.
    word2vec = Word2Vec(inputCol=inputCol, outputCol='features')
    embedded = word2vec.fit(DF).transform(DF)

    train_df, test_df = embedded.randomSplit([0.80, 0.20])

    # Fit and score the boosted-tree classifier.
    classifier_model = GBTClassifier(maxDepth=5).fit(train_df)
    scored = classifier_model.transform(test_df)
    roc_auc = evaluator.evaluate(scored)
    hit_ratio = scored.filter(scored.label == scored.prediction
                              ).count() / scored.count()
    # NOTE: as in the original, 'embeddingmodel' holds the GBT classifier
    # (the Word2Vec model is not returned).
    return {
        'embeddingmodel': classifier_model,
        'predictions': scored,
        'areaUnderROC': roc_auc,
        'accuracy': hit_ratio
    }
Beispiel #24
0
def spark_gbdt(train_file, test_file, features_columns='userID'):
    """Train a GBDT model from a training CSV and write predictions for a test CSV.

    :param train_file: path to the training CSV (must contain a 'label' column).
    :param test_file: path to the test CSV (must contain 'instanceID').
    :param features_columns: feature column name or list of names.
        BUG FIX: this parameter was previously ignored ('userID' was
        hard-coded in the assembler); it is now honoured. The default
        reproduces the original behavior.
    """
    from pyspark.ml.classification import GBTClassifier
    from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
    from pyspark.ml.pipeline import Pipeline
    sess = get_spark_sesssion()

    # Accept either a single column name or a list of names.
    if isinstance(features_columns, str):
        feature_cols = [features_columns]
    else:
        feature_cols = list(features_columns)

    string_indexer = StringIndexer(inputCol="label", outputCol="idx_label")
    v_c = VectorAssembler(inputCols=feature_cols, outputCol='v_features')
    trans = Pipeline(stages=[string_indexer, v_c])

    gbdt = GBTClassifier(maxDepth=5,
                         labelCol="idx_label",
                         predictionCol="pred",
                         featuresCol='v_features',
                         seed=42,
                         maxMemoryInMB=1024 * 10,
                         maxIter=4)

    train = sess.read.load(
        train_file,
        format='csv',
        header=True,
        inferSchema=True,
    )

    train_data = trans.fit(train).transform(train)
    model = gbdt.fit(train_data)
    model.write().overwrite().save('gbtc.model')
    # model = GBTClassifier.load('gbtc.model')
    print(model.featureImportances)

    test = sess.read.load(test_file,
                          format='csv',
                          header=True,
                          inferSchema=True)

    test_data = trans.fit(test).transform(test)
    predict = model.transform(test_data)
    predict.show()

    save_pandas(predict.select('instanceID', 'pred').toPandas(),
                'submission.gbdt.csv',
                index=False)
Beispiel #25
0
def gbdt(data, label_index, feature_indexs, project_url):
    """Train a GBDT classifier, persist it under ``project_url`` and demo a
    reload plus prediction.

    :param data: input DataFrame.
    :param label_index: column (name or index) holding each row's label value.
    :param feature_indexs: columns used as features.
    :param project_url: base path where the fitted model is saved.
    """
    # 2. Build the training set (split ratio comes from a module-level global).
    data_set = data.rdd.map(list)
    (train_data,
     test_data) = data_set.randomSplit([trainDataRatio, 1 - trainDataRatio])
    data.show()

    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        # BUG FIX: the label must be the row's value at label_index, not the
        # label_index constant itself (which made every row's label identical).
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = train_data.map(list).map(lambda x: func(x)).toDF()
    training_set.show()
    train_num = training_set.count()
    print("训练样本数:{}".format(train_num))

    # 3. Index the label and train the GBDT model (hyper-parameters come from
    # module-level globals: maxIter, stepSize, maxDepth, minInstancesPerNode, seed).
    string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = string_indexer.fit(training_set)
    tf = si_model.transform(training_set)

    gbdt = GBTClassifier(labelCol="indexed",
                         maxIter=maxIter,
                         stepSize=stepSize,
                         maxDepth=maxDepth,
                         minInstancesPerNode=minInstancesPerNode,
                         seed=seed)
    gbdt_model = gbdt.fit(tf)
    print(gbdt_model.featureImportances)

    # Save the model.
    model_path = project_url + '/gbdt'
    gbdt_model.write().overwrite().save(model_path)

    # Reload the model from disk.
    gbdt_model2 = GBTClassificationModel.load(model_path)

    # Predict with the reloaded model to verify the round trip.
    gbdt_model2.transform(training_set).select("prediction", "label",
                                               "features").show(5)
Beispiel #26
0
    def run_gradient_boost(self):
        '''
        Run gradient boost on the transformed data and report the results.

        Input:
        -------
        None

        Output:
        -------
        Dictionary of confusion matrix scores for this particular model.
        '''
        # Fixed 30 boosting iterations over the standard label/features columns.
        booster = GBTClassifier(labelCol="label",
                                featuresCol="features",
                                maxIter=30)

        # Train model.  This also runs the indexers.
        fitted = booster.fit(self.trainingData)

        # Score the held-out split.
        scored = fitted.transform(self.testData)

        # Record which model the following results belong to.
        with open(self.filename, 'a') as report:
            report.write("\n\nGradient Boost:")

        # Confusion matrix summarises how well the model performed.
        confusion_matrix = self.create_confusion_matrix(scored)

        # AUC via the evaluator held on the instance.
        auc = self.evaluator.evaluate(scored)
        print("AUC Score: ", str(auc))

        # Append the score to the report file.
        with open(self.filename, 'a') as report:
            report.write("\nAUC Score: " + str(auc))

        return confusion_matrix
Beispiel #27
0
 def gbtModel(self, dfTrain, dfTest, col):
     """Train a GBT classifier, log accuracy/f1 to MLflow and return a summary.

     :param dfTrain: training DataFrame with 'label'/'features' columns.
     :param dfTest: test DataFrame with the same schema.
     :param col: identifier appended to the returned result list.
     :return: list of the form ["gbt", col, [metric, value], ...].
     """
     client = mlflow.tracking.MlflowClient()
     mlflow.set_experiment("gML GBT")
     # End any run left open by a previous call before starting a fresh one.
     mlflow.end_run()
     mlflow.start_run()
     result = list()
     result.append("gbt")
     result.append(col)
     # NOTE(review): the explicit hyper-parameters below appear to mirror the
     # library defaults -- confirm before tuning any of them.
     gbt = GBTClassifier(
         labelCol="label",
         featuresCol="features",
         predictionCol='prediction',
         maxDepth=5,
         maxBins=32,
         minInstancesPerNode=1,
         minInfoGain=0.0,
         maxMemoryInMB=256,
         cacheNodeIds=False,
         checkpointInterval=10,
         lossType='logistic',
         maxIter=20,
         stepSize=0.1,
         seed=None,
         subsamplingRate=1.0,
         featureSubsetStrategy='all',
     )
     model = gbt.fit(dfTrain)
     predictions = model.transform(dfTest)
     # Evaluate and log each metric, collecting [name, value] pairs.
     metrics = ["accuracy", "f1"]
     for metric in metrics:
         evaluator = MulticlassClassificationEvaluator(
             labelCol="label",
             predictionCol="prediction",
             metricName=metric)
         v = evaluator.evaluate(predictions)
         mlflow.log_metric(metric, v)
         temp = [metric, v]
         result.append(temp)
     # Persist the fitted model as an MLflow artifact.
     mlflow.spark.log_model(model, "gbtModel")
     return result
Beispiel #28
0
# COMMAND ----------

# MAGIC %md
# MAGIC ####<span style="color:darkblue">Question 3</span>
# MAGIC 1. Train a GBTClassifier on the training data, call the trained model 'gbModel'

# COMMAND ----------

### Question 3.1 Answer ###
from pyspark.ml.classification import GBTClassifier
# Create initial GBTClassifier model (the comment previously said
# "LogisticRegression", but this cell builds a gradient-boosted tree).
gb = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

# Train model with Training Data
gbModel = gb.fit(trainingData)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ####Logistic Regression - Predict

# COMMAND ----------

# Score the test split with the fitted logistic-regression model.
# NOTE(review): lrModel comes from an earlier cell — confirm it is in scope.
lrPredictions = lrModel.transform(testData)

# display predictions
display(lrPredictions.select("label", "prediction", "probability"))
#display(lrPredictions)

# Area under ROC for the random-forest predictions.
# NOTE(review): RFpredictions / RFmodel / featureNames come from earlier
# cells — verify before running this cell in isolation.
evaluator = BinaryClassificationEvaluator(labelCol="is_duplicate")
AUCrf = evaluator.evaluate(RFpredictions)
print("Random Forest AUC = %g" % AUCrf)

#Feature importance code: rank features by the fitted forest's importances.
RFimp = pd.Series(RFmodel.featureImportances.toArray(), index=featureNames)
print(RFimp.sort_values(ascending=False))

# ## Gradient-boosted trees
# Fit a GBT model on the training split and score the test split.
# maxIter (number of boosting iterations) is the main hyper-parameter here;
# stepSize (eta) is held small and fixed.
gbt = GBTClassifier(labelCol="is_duplicate", maxIter=40, stepSize=0.05, maxDepth=4)
gbtModel = gbt.fit(trainingData)

# Score the held-out test data.
gbtPredictions = gbtModel.transform(testData)

# Report area under the ROC curve on the test predictions.
evaluator = BinaryClassificationEvaluator(labelCol="is_duplicate")
AUCgbt = evaluator.evaluate(gbtPredictions)
print("Gradient Boosted Trees AUC = %g" % AUCgbt)

# Rank the input features by their importance in the fitted ensemble.
GBTimp = pd.Series(gbtModel.featureImportances.toArray(), index=featureNames)
print(GBTimp.sort_values(ascending=False))
# Beispiel #30 (score: 0)
    # NOTE(review): interior of a row-parsing helper (likely `helper1`, mapped
    # at L914) — its `def` line is not visible here. `r` appears to be an
    # indexable record laid out as (ID, feature strings..., target).
    features=[]
    try:
        # Hash each feature string ("VAR_0001" + value) into a bucket mod D.
        # NOTE(review): relies on builtin hash(), which is salted per-process
        # for str in Python 3 — confirm PYTHONHASHSEED is pinned for
        # reproducible features across executors.
        fe = r[1:-1]
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_"+'{0:04}'.format(i)+fe[i])))%D)
        target = float(r[-1])
        ID=float(r[0])
        return target, Vectors.dense(features)
    except:
        # HACK: bare except — any malformed row silently becomes label 0.0
        # with an all-zero 1932-length vector; consider narrowing this.
        return (0.0,[0.0]*1932)
# Keep only complete rows: 1934 fields = ID + 1932 features + label.
new_rdd = rdd.filter(lambda i : len(i)==1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans,["label", "features"])
# NOTE(review): the split is taken from rdd_pca1 (defined elsewhere), not the
# `df` built just above — confirm this is intentional; df appears unused here.
(trainingData, testData) = rdd_pca1.randomSplit([0.7, 0.3])

# Index the label column into "indexed".
# NOTE(review): `td` is never used below — the GBT trains on trainingData with
# the raw "label" column; verify whether this indexing step is still needed.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(trainingData)
td = si_model.transform(trainingData)

# Train a 100-iteration, depth-10 GBT and write "<label>,<P(class 0)>" lines.
gbt = GBTClassifier(maxIter=100, maxDepth=10, labelCol="label")
model = gbt.fit(trainingData)
result = model.transform(testData).rdd.map(lambda r: str(r.label)+','+str(r.probability[0]))
result.saveAsTextFile("/user/demo/gbt_100_20")

# Tiny toy DataFrame: three labelled 2-D points.
df1 = sqlContext.createDataFrame([
     Row(label=0.0, features=Vectors.dense([0.0, 0.0])),
     Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
     Row(label=1.0, features=Vectors.dense([1.0, 0.0]))])

# Beispiel #31 (score: 0)
# Vectorise the raw columns and index the string label 'Private'.
output = assembler.transform(df)
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')
output_fixed = indexer.fit(output).transform(output)

# output_fixed.printSchema()

# Keep only the model inputs and split 70/30 into train and test sets.
final_data = output_fixed.select(['features', 'PrivateIndex'])
train_data, test_data = final_data.randomSplit([0.7, 0.3])

# Three tree-based classifiers over the same feature/label columns.
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

# AUC for the two models that expose a rawPrediction column.
my_binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')
for model_name, preds in (("DTC", dtc_preds), ("RFC", rfc_preds)):
    print(model_name)
    print(my_binary_eval.evaluate(preds))

# GBT only has a prediction column, so evaluate on that instead.
gbt_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex',
                                         rawPredictionCol='prediction')
# Compare DTC / RFC / GBTC accuracy on the sample libsvm dataset.
spark = SparkSession.builder.appName('mytree').getOrCreate()

sample_test_data_path = 'test_input/trees/sample_libsvm_data.txt'

data = spark.read.format('libsvm').load(sample_test_data_path)

train_data, test_data = data.randomSplit([0.7, 0.3])

# Three tree-based classifiers with default settings (100-tree forest).
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100)
gbtc = GBTClassifier()

dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbtc_model = gbtc.fit(train_data)

dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbtc_preds = gbtc_model.transform(test_data)

acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

# FIX: the original used Python 2 `print` statements (a SyntaxError under
# Python 3, which the rest of this file targets) and discarded the evaluator
# results; print the accuracy values as in the other examples.
print('DTC Accuracy:')
print(acc_eval.evaluate(dtc_preds))

print('RFC Accuracy:')
print(acc_eval.evaluate(rfc_preds))

print('GBTC Accuracy:')
print(acc_eval.evaluate(gbtc_preds))