def EvaluateModel(model, validationData):
    # The Python DecisionTreeModel currently cannot be called inside a map(),
    # i.e. the Scala-style
    #   validationData.map(lambda point: (model.predict(point.features), point.label))
    # fails, so zip() is used instead for that model type.
    if model.__class__.__name__ == "DecisionTreeModel":
        predictedLabel = model.predict(
            validationData.map(lambda line: line.features))
        # (score, label) order expected by BinaryClassificationMetrics
        scoresAndLabels = predictedLabel.zip(
            validationData.map(lambda line: line.label))
        areaUnderROC = float(
            BinaryClassificationMetrics(scoresAndLabels).areaUnderROC)
        matchedNum = scoresAndLabels.filter(
            lambda rp: rp[0] == rp[1]).count()
        accRate = float(matchedNum) / validationData.count()
    else:
        scoresAndLabels = validationData.map(
            lambda line: (model.predict(line.features), line.label)).collect()
        scoresAndLabels = [[float(i), j] for i, j in scoresAndLabels]
        rdd_scoresAndLabels = globalVal.sc.parallelize(scoresAndLabels)
        areaUnderROC = float(
            BinaryClassificationMetrics(rdd_scoresAndLabels).areaUnderROC)
        matchedNum = rdd_scoresAndLabels.filter(
            lambda rp: rp[0] == rp[1]).count()
        accRate = float(matchedNum) / validationData.count()
    return areaUnderROC, accRate

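# A minimal, self-contained sketch (hand-built toy scores, not from the code
# above) showing the (score, label) pair format BinaryClassificationMetrics
# consumes, which both branches of EvaluateModel ultimately produce.
from pyspark import SparkContext
from pyspark.mllib.evaluation import BinaryClassificationMetrics

sc = SparkContext.getOrCreate()
pairs = sc.parallelize([(0.9, 1.0), (0.7, 1.0), (0.4, 0.0), (0.1, 0.0)])
print(BinaryClassificationMetrics(pairs).areaUnderROC)  # 1.0 for this toy data
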
def gen_lr_sort_model_metrics(test_df):
    from pyspark.ml.classification import LogisticRegressionModel
    logistic_regression_model = LogisticRegressionModel.load(
        "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
    lr_result = logistic_regression_model.evaluate(test_df).predictions
    lr_result.show()

    def vector_to_double(row):
        # (score, label) pairs: P(class=1) paired with the true click flag
        return float(row.probability[1]), float(row.click_flag)

    score_labels = lr_result.select(
        ["click_flag", "probability"]).rdd.map(vector_to_double)
    score_labels.collect()

    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    binary_classification_metrics = BinaryClassificationMetrics(
        scoreAndLabels=score_labels)
    area_under_roc = binary_classification_metrics.areaUnderROC
    print(area_under_roc)

    tp = lr_result[(lr_result.click_flag == 1) & (lr_result.prediction == 1)].count()
    # true negatives are prediction == 0 (the original counted prediction == 1)
    tn = lr_result[(lr_result.click_flag == 0) & (lr_result.prediction == 0)].count()
    fp = lr_result[(lr_result.click_flag == 0) & (lr_result.prediction == 1)].count()
    fn = lr_result[(lr_result.click_flag == 1) & (lr_result.prediction == 0)].count()
    print("tp {} tn {} fp {} fn {}".format(tp, tn, fp, fn))
    # cast to float so the ratios are not truncated by integer division
    print('accuracy is : %f' % (float(tp + tn) / (tp + tn + fp + fn)))
    print('recall is : %f' % (float(tp) / (tp + fn)))
    print('precision is : %f' % (float(tp) / (tp + fp)))

def print_performance_metrics(predictions):
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
    aupr = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
    print("auc = {}".format(auc))
    print("aupr = {}".format(aupr))

    # Get RDD of predictions and labels for eval metrics
    predictionAndLabels = predictions.select("prediction", "label").rdd

    # Instantiate metrics objects
    binary_metrics = BinaryClassificationMetrics(predictionAndLabels)
    multi_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = {}".format(binary_metrics.areaUnderPR))
    # Area under ROC curve
    print("Area under ROC = {}".format(binary_metrics.areaUnderROC))
    # Accuracy
    print("Accuracy = {}".format(multi_metrics.accuracy))
    # Confusion Matrix
    print(multi_metrics.confusionMatrix())
    # F1
    print("F1 = {}".format(multi_metrics.fMeasure(1.0)))
    # Precision
    print("Precision = {}".format(multi_metrics.precision(1.0)))
    # Recall
    print("Recall = {}".format(multi_metrics.recall(1.0)))
    # FPR
    print("FPR = {}".format(multi_metrics.falsePositiveRate(1.0)))
    # TPR
    print("TPR = {}".format(multi_metrics.truePositiveRate(1.0)))

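# Hedged usage sketch for print_performance_metrics: any fitted pyspark.ml
# classifier's transform() output carries the columns it reads
# ('rawPrediction', 'prediction', 'label'). train_df / test_df are assumed
# DataFrames with 'features' and 'label' columns, not from the original code.
from pyspark.ml.classification import RandomForestClassifier

rf_fitted = RandomForestClassifier(labelCol="label",
                                   featuresCol="features").fit(train_df)
print_performance_metrics(rf_fitted.transform(test_df))
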
def random_forest(training_data, test_data, output_str):
    rf = RandomForestClassifier(labelCol="label", featuresCol="features")
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [20, 50, 80]) \
        .addGrid(rf.maxDepth, [3, 5, 10, 15]) \
        .build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    crossval = CrossValidator(estimator=rf,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)
    rfmodel = crossval.fit(training_data)
    rfPredictions = rfmodel.transform(test_data)

    # Evaluate the bestModel found by cross validation. Note that the
    # BinaryClassificationEvaluator default metric is areaUnderROC, so this
    # value is an AUC, not an accuracy.
    accuracy = evaluator.evaluate(rfPredictions)
    output_str = output_str + "Random Forest accuracy is: " + str(accuracy) + "\n"

    predictionandLabels = rfPredictions.withColumn(
        'label1', rfPredictions["label"].cast("double")).select(
            "prediction", "label1").rdd
    metrics = BinaryClassificationMetrics(predictionandLabels)
    auroc = metrics.areaUnderROC
    aupr = metrics.areaUnderPR
    output_str = output_str + "RF Area under ROC Curve: " + str(auroc) + "\n"
    output_str = output_str + "RF Area under PR Curve: " + str(aupr) + "\n"
    return output_str

def naive_bayes(training_data, test_data, output_str):
    nb = NaiveBayes(modelType="multinomial")
    paramGrid = ParamGridBuilder().addGrid(
        nb.smoothing, [0.01, 0.1, 1.0, 10, 100]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    cv = CrossValidator(estimator=nb,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5)
    cvModel = cv.fit(training_data)
    cvPredictions = cvModel.transform(test_data)

    # Evaluate the bestModel found by cross validation. As above, the
    # evaluator's default metric is areaUnderROC, not an accuracy.
    accuracy = evaluator.evaluate(cvPredictions)
    output_str = output_str + "Naive Bayes accuracy is: " + str(accuracy) + "\n"

    predictionandLabels = cvPredictions.withColumn(
        'label1', cvPredictions["label"].cast("double")).select(
            "prediction", "label1").rdd
    metrics = BinaryClassificationMetrics(predictionandLabels)
    auroc = metrics.areaUnderROC
    aupr = metrics.areaUnderPR
    output_str = output_str + "NB Area under ROC Curve: " + str(auroc) + "\n"
    output_str = output_str + "NB Area under PR Curve: " + str(aupr) + "\n"
    return output_str

def test_confusion_matrix(sdf):
    assem = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'],
                            outputCol='features')
    rf = RandomForestClassifier(featuresCol='features', labelCol='Survived',
                                numTrees=20)
    pipeline = Pipeline(stages=[assem, rf])
    model = pipeline.fit(sdf.fillna(0.0))
    predictions = model.transform(sdf.fillna(0.0)).select(
        'probability', 'Survived')
    # handyspark's BinaryClassificationMetrics variant, which takes a
    # DataFrame plus score/label column names and a threshold argument
    bcm = BinaryClassificationMetrics(predictions, scoreCol='probability',
                                      labelCol='Survived')
    predictions = predictions.toHandy().to_metrics_RDD('probability', 'Survived')
    predictions = np.array(predictions.collect())
    scm = bcm.confusionMatrix().toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .5)
    npt.assert_array_almost_equal(scm, pcm)
    scm = bcm.confusionMatrix(.3).toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .3)
    npt.assert_array_almost_equal(scm, pcm)

def plot_roc(model, test_data, name, label_col):
    transformed = model.transform(test_data)
    results = transformed.select(["probability", label_col])
    results_collect = results.collect()
    results_list = [(float(i[0][1]), float(i[1])) for i in results_collect]
    score_and_labels = sc.parallelize(results_list)
    metrics = BinaryClassificationMetrics(score_and_labels)
    print("The ROC score for " + name + " is : ", metrics.areaUnderROC)

    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    for fp, tp, thresh in zip(fpr, tpr, thresholds):
        print("fpr: ", fp, " tpr: ", tp, " threshold: ", thresh)

    plt.clf()
    plt.figure()
    plt.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % roc_auc)
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic for " + name)
    plt.legend(loc="lower right")
    if not os.path.isdir(os.path.join(script_dir, png_dir)):
        os.makedirs(os.path.join(script_dir, png_dir))
    plt.savefig(
        os.path.join(script_dir, png_dir + name.replace(" ", "") + ".png"))

def model(classifiers, training, testing, week):
    results = ""
    timing = []
    for classifier in classifiers:
        timeStart = time.time()
        clf = get_classifier(classifier)
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures")
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
        model = pipeline.fit(training)
        prediction = model.transform(testing)
        # BinaryClassificationMetrics expects (score, label) pairs, so select
        # the prediction column first (the original selected label first)
        metrics = BinaryClassificationMetrics(
            prediction.select("prediction", "label").rdd)
        results = results + "new," + classifier + "," + week + "," + str(
            metrics.areaUnderROC) + "," + str(metrics.areaUnderPR) + "\n"
        timing.append(time.time() - timeStart)
    return results, timing

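# Hypothetical get_classifier helper (it is not defined above): a minimal
# sketch mapping a name to a pyspark.ml estimator wired to the indexed
# columns that model() builds with StringIndexer / VectorIndexer.
from pyspark.ml.classification import (GBTClassifier, LogisticRegression,
                                       RandomForestClassifier)

def get_classifier(name):
    estimators = {
        "rf": RandomForestClassifier(labelCol="indexed",
                                     featuresCol="indexedFeatures"),
        "gbt": GBTClassifier(labelCol="indexed",
                             featuresCol="indexedFeatures"),
        "lr": LogisticRegression(labelCol="indexed",
                                 featuresCol="indexedFeatures"),
    }
    return estimators[name]
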
def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    score = score.map(lambda p: float(p))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    AUC = metrics.areaUnderROC
    return AUC

def Print_class_info(xy_predict):
    '''
    Print classification-quality metrics.
    xy_predict: the data set scored by the model
    '''
    predict_and_target_rdd = xy_predict.rdd.map(
        lambda row: (float(row.prediction), float(row.label)))
    metrics = BinaryClassificationMetrics(predict_and_target_rdd)
    correct_amount = xy_predict.filter(
        xy_predict['label'] == xy_predict['prediction']).count()
    total_amount = xy_predict.count()
    accuracy_rate = float(correct_amount) / total_amount
    true_positive_amount = xy_predict.filter(
        xy_predict['label'] == 1).filter(
            xy_predict['prediction'] == 1).count()
    positive_amount = xy_predict.filter(xy_predict['label'] == 1).count()
    predict_amount = xy_predict.filter(xy_predict['prediction'] == 1).count()
    recall_rate = float(true_positive_amount) / positive_amount
    precision_rate = float(true_positive_amount) / predict_amount
    print('----------------------------------------------')
    print("Precision score: %s" % precision_rate)
    print("Recall score: %s" % recall_rate)
    print("Accuracy score: %s" % accuracy_rate)
    print("Area under PR: %s" % metrics.areaUnderPR)
    print("Area under ROC: %s" % metrics.areaUnderROC)
    print('----------------------------------------------')

def predict():
    testData = MLUtils.loadLibSVMFile(sc, INPUT_DATA_PATH)
    print("[INFO] load complete.")
    model = RandomForestModel.load(sc, TEST_MODEL_PATH)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    lst = predictions.collect()
    with open(TEST_PREDICT_PATH + "/" +
              time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()) + ".txt",
              'w') as f:
        for k in lst:
            f.write(str(k) + "\n")

    # (score, label) order expected by BinaryClassificationMetrics
    predsAndLabels = predictions.map(lambda p: tobin(p)).zip(
        testData.map(lambda lp: tobin(lp.label)))
    metrics = BinaryClassificationMetrics(predsAndLabels)
    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
    testErr = predsAndLabels.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('[INFO] Test Error = ' + str(testErr))

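# Hypothetical tobin helper (not defined above): a sketch assuming the raw
# labels/predictions may take values other than 0/1 and must be collapsed
# to a binary 0.0/1.0 before the binary metrics are computed.
def tobin(x):
    return 1.0 if float(x) > 0 else 0.0
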
def svmClassification(trainSetFile, testSetFile):
    data1 = sc.textFile(directory_supervised + trainSetFile)
    trainData = data1.map(parsePoint)
    data2 = sc.textFile(directory_supervised + testSetFile)
    testData = data2.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(trainData, iterations=10)

    # Earlier experiments, kept commented out: error rates and
    # MulticlassMetrics from hard 0/1 predictions.
    # labelsAndPreds = trainData.map(lambda p: (p.label, model.predict(p.features)))
    # trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainData.count())
    # print("Training Error = " + str(trainErr))
    # labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    # testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    # print("Test Error = " + str(testErr))
    # return testErr
    # labelsAndPreds = testData.map(lambda p: (p.label, float(model.predict(p.features))))
    # truePos = labelsAndPreds.filter(lambda p: p[0] == p[1]).count()
    # print("True pos : " + str(truePos))
    # metrics1 = MulticlassMetrics(labelsAndPreds)
    # print("Recall : " + str(metrics1.recall()))
    # print("Precision : " + str(metrics1.precision()))
    # print(metrics1.confusionMatrix())

    # clearThreshold() makes predict() return raw scores instead of 0/1
    # labels, which is what BinaryClassificationMetrics expects
    model.clearThreshold()
    scoreAndLabels = testData.map(
        lambda p: (float(model.predict(p.features)), p.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC

def score(model, test_data):
    predictions = model.predict(test_data.map(lambda x: x.features))
    labels = test_data.map(lambda x: x.label)
    labels_and_preds = predictions.zip(labels)
    metrics = BinaryClassificationMetrics(labels_and_preds)
    return (metrics.areaUnderPR, metrics.areaUnderROC)

def predict_SVMWithSGD(numIterations, step, regParam, regType):
    """
    SVMWithSGD.train(data, iterations=100, step=1.0, regParam=0.01,
                     miniBatchFraction=1.0, initialWeights=None, regType='l2',
                     intercept=False, validateData=True, convergenceTol=0.001)

    data:              the training data, an RDD of LabeledPoint
    iterations:        the number of iterations, default 100
    step:              the step parameter used in SGD, default 1.0
    regParam:          the regularizer parameter, default 0.01
    miniBatchFraction: fraction of data used for each SGD iteration, default 1.0
    initialWeights:    the initial weights, default None
    regType:           the type of regularizer used for training; allowed
                       values are 'l1' (L1 regularization), 'l2' (L2
                       regularization, the default), and None (no regularization)
    intercept:         boolean indicating whether to use the augmented
                       representation for training data (i.e. whether bias
                       features are activated), default False
    validateData:      boolean indicating whether the algorithm should
                       validate data before training, default True
    convergenceTol:    a condition which decides iteration termination,
                       default 0.001
    """
    svmModel = SVMWithSGD.train(scaledData,
                                iterations=numIterations,
                                step=step,
                                regParam=regParam,
                                regType=regType)
    # (prediction, label) pairs on the (scaled) training data
    svmMetrics = scaledData.map(lambda p: (svmModel.predict(p.features), p.label))
    # count over scaledData, the RDD actually scored (the original divided
    # by data.count())
    svmAccuracy = svmMetrics.filter(
        lambda pl: pl[0] == pl[1]).count() * 1.0 / scaledData.count()
    metrics = BinaryClassificationMetrics(svmMetrics)
    # print("SVMWithSGD model accuracy is: %f in %d iterations, step:%f; regParam:%f; regType:%s"
    #       % (svmAccuracy, numIterations, step, regParam, regType))
    return svmAccuracy

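# Hedged sketch of how the global `scaledData` used above might be built:
# standardizing the features of a LabeledPoint RDD with mllib's
# StandardScaler. The names `data` and `scaledData` are assumptions, not
# definitions from the original code.
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint

scaler = StandardScaler(withMean=True, withStd=True).fit(
    data.map(lambda p: p.features))
scaledData = data.map(lambda p: p.label).zip(
    scaler.transform(data.map(lambda p: p.features))).map(
        lambda lf: LabeledPoint(lf[0], lf[1]))
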
def predictData(sc, model):
    # ---------------------- 1. Load and transform the data ----------------------
    print("Loading data...")
    rawDataWithHeader = sc.textFile("s3n://bigdata17demo/train.csv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    lines = rawData.map(lambda x: x.split(","))
    print("Total: " + str(lines.count()) + " records")

    # ------------- 2. Build the LabeledPoint RDD needed for evaluation -------------
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(extract_label(r), extract_features(r)))
    print(labelpointRDD.first())
    testData = labelpointRDD
    print("testData: " + str(testData.count()))

    # (score, label) order expected by BinaryClassificationMetrics
    predsAndLabels = testData.map(
        lambda p: (float(model.predict(p.features)), p.label))
    metrics = BinaryClassificationMetrics(predsAndLabels)
    print("Area under PR = %s" % metrics.areaUnderPR)
    print("Area under ROC = %s" % metrics.areaUnderROC)
    testErr = predsAndLabels.filter(
        lambda seq: seq[0] != seq[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))

    # ---------------------- 4. Predict and write out the results ----------------------
    f = open('workfile', 'w')
    for lp in labelpointRDD.take(499999):
        predict = int(model.predict(lp.features))
        f.write(" " + str(predict) + " ")
    f.close()

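# Hypothetical extract_label / extract_features helpers (not shown above):
# a minimal sketch assuming the first CSV field is the label and the rest
# are numeric features, with missing values ('?' or '') treated as 0.
def extract_label(fields):
    return float(fields[0])

def extract_features(fields):
    return [0.0 if v in ('?', '') else float(v) for v in fields[1:]]
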
def main():
    start = time.time()
    conf = SparkConf().setMaster("local").setAppName("income")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    income_df = load(sqlContext, csv_path=CSV_PATH)
    # income_df.show()
    # print(income_df.dtypes)
    # print(income_df.count())
    features_df = preprocess(data_frame=income_df)

    # train / test split
    train_df, test_df = features_df.randomSplit([7.0, 3.0], 100)

    # logistic regression
    income_lr = LogisticRegression(featuresCol="features",
                                   labelCol="income_index",
                                   regParam=0.0,
                                   elasticNetParam=0.0,
                                   maxIter=200)
    income_model = income_lr.fit(train_df)

    # modeling
    print("Training:")
    training_summary = income_model.summary
    training_FPR = training_summary.roc.select('FPR').collect()
    training_TPR = training_summary.roc.select('TPR').collect()
    plot_roc(training_FPR, training_TPR, "pic/training_roc.jpg")
    training_recall = training_summary.pr.select('recall').collect()
    training_precision = training_summary.pr.select('precision').collect()
    # Area under ROC curve
    print("Training Area under ROC = %s" % training_summary.areaUnderROC)
    # accuracy
    print("Training Accuracy = %s" % training_summary.accuracy)
    plot_pr(training_recall, training_precision, "pic/training_pr.jpg")

    # evaluation
    print()
    print("Evaluation:")
    pred_df = income_model.transform(test_df).select("prediction", "income_index")
    raw_pred_df = income_model.transform(test_df).select(
        "probability", "income_index").rdd.map(lambda l: (float(l[0][1]), l[1]))
    metrics = BinaryClassificationMetrics(raw_pred_df)
    # Area under ROC curve
    print("Testing Area under ROC = %s" % metrics.areaUnderROC)
    # accuracy
    metrics = MulticlassMetrics(pred_df.rdd)
    print("Testing Accuracy = %s" % metrics.accuracy)
    # confusion matrix
    print("Testing Confusion Matrix:")
    print(metrics.confusionMatrix().toArray())

    print("Total cost %fs" % (time.time() - start))
    print("Done!")

def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    print(score)
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label)).map(
        lambda xy: (float(xy[0]), float(xy[1])))
    print(scoreAndLabels.take(1))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC

def evaluate_model(model, validationData):
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    scoreAndLabels.take(5)
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    print("auc: ", metrics.areaUnderROC)
    return metrics.areaUnderROC

def EvaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    # predict() returns ints here; convert to float for the metrics
    score = score.map(lambda x: float(x))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metric = BinaryClassificationMetrics(scoreAndLabels)
    AUC = metric.areaUnderROC
    return (AUC)

def main(sc):
    train_data = sc.textFile(
        "/data/scratch/vw/criteo-display-advertising-dataset/train.txt").map(
            parsePoint)
    model = LogisticRegressionWithSGD.train(train_data,
                                            iterations=1000,
                                            miniBatchFraction=0.0001,
                                            step=.001,
                                            regType="l2")

    valid_data = sc.textFile("input/valid_data.txt").map(parsePoint)
    labelsAndPreds = valid_data.map(
        lambda p: (float(model.predict(p.features)), p.label))
    Accuracy = labelsAndPreds.filter(
        lambda pl: pl[1] == pl[0]).count() / float(valid_data.count())
    FP = labelsAndPreds.filter(lambda pl: pl[1] == 0 and pl[0] == 1).count()
    N = float(labelsAndPreds.filter(lambda pl: pl[1] == 0).count())
    FPR = FP / N
    output = "Accuracy valid = " + str(Accuracy) + "\nFPR valid = " + str(FPR)
    print(output)
    metrics = BinaryClassificationMetrics(labelsAndPreds)
    output += "\nArea under ROC valid = " + str(metrics.areaUnderROC)
    print(output)

    test_data = sc.textFile(
        "/data/scratch/vw/criteo-display-advertising-dataset/test.txt").map(
            parsePoint)
    labelsAndPreds = test_data.map(
        lambda p: (float(model.predict(p.features)), p.label))
    Accuracy = labelsAndPreds.filter(
        lambda pl: pl[1] == pl[0]).count() / float(test_data.count())
    FP = labelsAndPreds.filter(lambda pl: pl[1] == 0 and pl[0] == 1).count()
    N = float(labelsAndPreds.filter(lambda pl: pl[1] == 0).count())
    FPR = FP / N
    output += "\nAccuracy test = " + str(Accuracy) + "\nFPR test = " + str(FPR)
    print(output)
    metrics = BinaryClassificationMetrics(labelsAndPreds)
    output += "\nArea under ROC test = " + str(metrics.areaUnderROC)
    print(output)

    output_rdd = sc.parallelize([output])
    output_rdd.saveAsTextFile("str")

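# Hypothetical parsePoint for the Criteo lines read above (the real helper
# is not shown): a sketch assuming tab-separated records of a 0/1 label
# followed by 13 integer features, ignoring the categorical columns and
# treating blanks as 0.
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    fields = line.split('\t')
    return LabeledPoint(float(fields[0]),
                        [float(v) if v else 0.0 for v in fields[1:14]])
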
def evaluate_lrmodel():
    # `data` and `lrModel` are module-level globals here
    scoreAndLabels = data.map(
        lambda point: (float(lrModel.predict(point.features)), point.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    # area under precision-recall curve
    print("area under PR %f" % metrics.areaUnderPR)
    # area under ROC curve
    print("area under ROC %f" % metrics.areaUnderROC)

def printRddBinaryClassificationMetrics(self, predictions_and_labels):
    # pyspark.mllib's BinaryClassificationMetrics has no confusionMatrix();
    # that method lives on MulticlassMetrics, so use that here
    metrics = MulticlassMetrics(predictions_and_labels)
    print("KAPPA=" + str(
        self.computeKappa(np.array(metrics.confusionMatrix().toArray()))))
    print("BA=" + str(
        self.computeBA(np.array(metrics.confusionMatrix().toArray()))))
    CMarray = metrics.confusionMatrix().toArray()
    # CMstring = ','.join(['%.5f' % num for num in CMarray])
    print("CM=" + str(CMarray))

def get_model_eval_metrics(labeled_test_comments, model, model_name, label_name):
    transformed_comments = model.transform(labeled_test_comments)
    predictionAndLabels = transformed_comments.rdd.map(
        lambda lp: (float(lp.probability[1]), float(lp[label_name])))
    metrics = BinaryClassificationMetrics(predictionAndLabels)
    print("auPR for {}: {}".format(model_name, metrics.areaUnderPR))
    print("auROC for {}: {}".format(model_name, metrics.areaUnderROC))

def evaluate_model(model, valid_data):
    # predict() returns int 0/1 here, so cast the scores to float
    score = model.predict(valid_data.map(lambda p: p.features))
    score_and_labels = score.map(lambda x: float(x)).zip(
        valid_data.map(lambda p: p.label))
    metrics = BinaryClassificationMetrics(score_and_labels)
    AUC = metrics.areaUnderROC
    return AUC

def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    metrics2 = BinaryClassificationMetrics(predictions_and_labels)
    print('Precision of True ', metrics.precision(1))
    print('Precision of False', metrics.precision(0))
    print('Recall of True    ', metrics.recall(1))
    print('Recall of False   ', metrics.recall(0))
    print('areaUnderROC      ', metrics2.areaUnderROC)
    print('areaUnderPR       ', metrics2.areaUnderPR)

def get_auc_roc(classifier, training, test):
    model = classifier.fit(training)
    out = model.transform(test) \
        .select("prediction", "label") \
        .rdd.map(lambda x: (float(x[0]), float(x[1])))
    metrics = BinaryClassificationMetrics(out)
    # the original referenced an undefined `clf`; use the classifier argument
    print("Model: {1}. Area under ROC: {0:2f}".format(
        metrics.areaUnderROC, classifier.__class__.__name__))
    return model, out, metrics

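# Hedged usage sketch for get_auc_roc; `df` is an assumed DataFrame with
# 'features' and 'label' columns, not something defined in this code.
from pyspark.ml.classification import LogisticRegression

training, test = df.randomSplit([0.8, 0.2], seed=42)
model, out, metrics = get_auc_roc(LogisticRegression(maxIter=50), training, test)
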
def main():
    # Read the Titanic train csv from HDFS and drop the header
    trainTitanic = sc.textFile(srcDir + "titanic_train.csv")
    trainHeader = trainTitanic.first()
    trainTitanic = trainTitanic.filter(lambda line: line != trainHeader) \
                               .mapPartitions(lambda x: csv.reader(x))
    trainTitanic.first()

    # Data transformations; filter out lines with empty strings
    trainTitanic = trainTitanic.map(
        lambda line: line[1:3] + sexTransformMapper(line[4]) + line[5:11])
    trainTitanic = trainTitanic.filter(lambda line: line[3] != '') \
                               .filter(lambda line: line[4] != '')
    trainTitanic.take(10)

    # Create the MLlib-specific LabeledPoint RDD: (label, [v1, v2, ..., vp]).
    # Note: the original wrapped the slice in an extra list, [line[1:5]],
    # which nests the feature vector; pass the slice directly.
    trainTitanicLP = trainTitanic.map(
        lambda line: LabeledPoint(line[0], line[1:5]))
    trainTitanicLP.first()

    # Split the dataset into train (70%) and test (30%) sets
    (trainData, testData) = trainTitanicLP.randomSplit([0.7, 0.3])

    # Random forest: same parameters as sklearn (?)
    from pyspark.mllib.tree import RandomForest
    time_start = time.time()
    model_rf = RandomForest.trainClassifier(trainData,
                                            numClasses=2,
                                            categoricalFeaturesInfo={},
                                            numTrees=100,
                                            featureSubsetStrategy='auto',
                                            impurity='gini',
                                            maxDepth=12,
                                            maxBins=32,
                                            seed=None)
    model_rf.numTrees()
    model_rf.totalNumNodes()
    time_end = time.time()
    time_rf = time_end - time_start
    print("RF takes %d s" % time_rf)

    # Predictions on the test set, paired as (score, label) for the metrics
    predictions = model_rf.predict(testData.map(lambda x: x.features))
    predsAndLabels = predictions.zip(testData.map(lambda lp: lp.label))

    # first metrics
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    metrics = BinaryClassificationMetrics(predsAndLabels)
    print('=====================================================')
    print(' output : ')
    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
    print('=====================================================')

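# Hypothetical sexTransformMapper (not defined above): a sketch that encodes
# the Titanic 'Sex' column as a one-element list so it splices into the
# surrounding row slices in main().
def sexTransformMapper(value):
    return ['0'] if value == 'male' else ['1']
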
def evaluateModel(model, validationData):
    # Compute the AUC (area under the ROC curve)
    score = model.predict(validationData.map(lambda x: x.features))
    print(score)
    scoreAndLabels = score.zip(validationData.map(lambda x: x.label))
    print("first 5 entries of scoreAndLabels:", scoreAndLabels.take(5))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    AUC = metrics.areaUnderROC
    return (AUC)

def evaluateModel(model, validationData):
    """
    Evaluate the model's AUC.
    """
    # the original read p.feature; the LabeledPoint attribute is p.features
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels = score.zip(
        validationData.map(lambda p: p.label))  # [(s1, l1), (s2, l2), ...]
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    AUC = metrics.areaUnderROC
    return AUC

def evaluateMetrics(model, data, label):
    # (score, label) order expected by BinaryClassificationMetrics (the
    # original built (label, score) pairs)
    scoresAndLabels = data.map(lambda lp: (
        getP(lp.features, model.weights, model.intercept), lp.label))
    auc = BinaryClassificationMetrics(scoresAndLabels).areaUnderROC
    log_loss = evaluateResults(model, data)
    sys.stderr.write('\n LogLoss {0} = {1}'.format(label, log_loss))
    sys.stderr.write('\n AUC {0} = {1}\n'.format(label, auc))
    return (label, log_loss, auc)

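# Hypothetical getP (not defined above): presumably the logistic score
# sigmoid(w.x + b) that evaluateMetrics pairs with each label.
from math import exp

def getP(features, weights, intercept):
    margin = features.dot(weights) + intercept
    return 1.0 / (1.0 + exp(-margin))
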