def build_randomForest(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()
    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10])\
                             .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)  # fit the CrossValidator; the original fit the bare estimator and skipped the grid search
    prediction = cvModel.transform(df)
    prediction.show()
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
def build_decisionTree(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)
    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
def testClassification(data):
    # Train a RandomForest model (the original comment said GradientBoostedTrees,
    # but the estimator below is a RandomForestClassifier).
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)
    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)
    selected = predictionDF\
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print(row)
    # DataFrame.map was removed in Spark 2.x; go through .rdd instead
    scoresAndLabels = predictionDF.rdd\
        .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel', metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
def build_lrmodel(path):  # fixed typo: was "buil_lrmodel"
    df = load_data(path)
    #-------------------- preparing the dataset -------------------------------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    print("count = ", df.count())
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    #------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)
    prediction = model.transform(df)
    prediction.show(truncate=False)
    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation :", evaluator.evaluate(prediction))
    #-------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000])\
                             .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
def main():
    '''
    Takes one input argument: location of the directory for training and test data files.
    :return: prints the area under the ROC curve to the console.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures)\
                                  .addGrid(lr.regParam, regParam).build()
    cv = CrossValidator().setEstimator(pipeline).setEvaluator(BinaryClassificationEvaluator())\
                         .setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()
    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def evaluate(predictions, spark_metrics):
    # using sklearn metrics
    y_hat = predictions.rdd.map(lambda p: p.prediction).collect()
    y_true = predictions.rdd.map(lambda p: p.label).collect()
    print(metrics.classification_report(y_true, y_hat))
    print('AUC score: %f' % metrics.roc_auc_score(y_true, y_hat))
    print("Accuracy: %f" % metrics.accuracy_score(y_true, y_hat))
    # using spark metrics ("eval" renamed so the builtin is not shadowed)
    result = []
    for metric in spark_metrics:
        evaluator = BinaryClassificationEvaluator().setMetricName(metric)
        result.append(evaluator.evaluate(predictions))
    return result
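# A minimal usage sketch for evaluate() above, under the assumption that
# `predictions` is the DataFrame returned by a fitted binary classifier's
# transform() with the default "rawPrediction"/"label" columns (the variable
# names here are hypothetical):
# auroc, aupr = evaluate(predictions, ["areaUnderROC", "areaUnderPR"])
# print("AUROC=%f, AUPR=%f" % (auroc, aupr))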
def pipelineRF(dataDF):
    """
    Fit a random-forest pipeline with cross-validation and report AUC.
    :param dataDF: input DataFrame with 'label' and 'features' columns
    """
    print('pipeline starting...')
    labelIndexer_transModel = StringIndexer(inputCol='label', outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features", maxCategories=37)\
        .fit(dataDF)
    #dtEstimator = DecisionTreeClassifier(featuresCol='indexed_features', labelCol='indexLabel', maxDepth=5,
    #                                     maxBins=40, minInstancesPerNode=1, minInfoGain=0.0, impurity='entropy')
    rfEstimator = RandomForestClassifier(labelCol='indexLabel', featuresCol='indexed_features',
                                         maxBins=40, seed=13)
    pipeline = Pipeline(stages=[labelIndexer_transModel, featIndexer_transModel, rfEstimator])
    paramGrid = ParamGridBuilder()\
        .addGrid(rfEstimator.maxDepth, [5, 10, 30])\
        .addGrid(rfEstimator.numTrees, [20, 50, 100]).build()
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              rawPredictionCol='rawPrediction',
                                              metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds=10)
    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel was fit using parameters:\n")
    pprint(cvModel.explainParams())
    predictionDF = cvModel.transform(dataDF)
    selected = predictionDF\
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.take(5):
        print(row)
    # note: this evaluates on the same data the model was fit on, not a held-out test set
    aucMetric = evaluator.evaluate(selected)
    print("auc of the (training) data is: %.3f" % aucMetric)
# Delayed flights with Gradient-Boosted Trees
# You've previously built a classifier for flights likely to be delayed using a Decision Tree.
# In this exercise you'll compare a Decision Tree model to a Gradient-Boosted Trees model.
# The flights data have been randomly split into flights_train and flights_test.

# Instructions
# 100 XP
# Import the classes required to create Decision Tree and Gradient-Boosted Tree classifiers.
# Create Decision Tree and Gradient-Boosted Tree classifiers. Train on the training data.
# Create an evaluator and calculate AUC on testing data for both classifiers. Which model performs better?
# Find the number of trees and the relative importance of features in the Gradient-Boosted Tree classifier.

from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(tree.transform(flights_test))
evaluator.evaluate(gbt.transform(flights_test))

# Find the number of trees and the relative importance of features
print(gbt.getNumTrees)
print(gbt.featureImportances)
# Save the best model
lrModel = lrCvModel.bestModel
mlflow.spark.save_model(lrModel, "model/" + algorithm)

# Log some parameters
mlflow.log_param("Algorithm", algorithm)
mlflow.log_param("regParam", lrCvModel.bestModel.stages[-1]._java_obj.getRegParam())
mlflow.log_param("elasticNetParam",
                 lrCvModel.bestModel.stages[-1]._java_obj.getElasticNetParam())
mlflow.log_param("maxIter", lrCvModel.bestModel.stages[-1]._java_obj.getMaxIter())

# Log some metrics
mlflow.log_metric("auc", evaluator.evaluate(lrPredictions))

# Log ROC plot
plotFile = plot_roc(predictions=lrPredictions, algorithm=algorithm)
mlflow.log_artifact(plotFile)

# COMMAND ----------

# MAGIC %md
# MAGIC Decision Tree Experiment

# COMMAND ----------

# Decision Tree
with mlflow.start_run():
col_map = {v[0]: i for i, v in enumerate(sorted(tuple(value_counts.items()),
                                                key=lambda x: x[1], reverse=True))}
df = df.withColumn(transform_f,
                   fn.udf(lambda x: col_map.get(x), IntegerType())(df[transform_f]))
for f, d in df.dtypes:
    if d == 'string':
        df = df.withColumn(f, df[f].cast('int'))
    if f == 'class':
        df = df.withColumn(f, df[f].cast('string'))
df = df.dropna()

# Load the saved pipeline model
load_pipeline = PipelineModel.load('file:///D:/python_test/spark_ml/pipeline')
test_predict = load_pipeline.transform(df)
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label'
)
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderPR'}))

origin_test_df = df.select(feature_cols)
predict_df = load_pipeline.transform(origin_test_df)
predict_df.show(20)  # show() prints the rows itself; wrapping it in print() just prints None
if __name__ == '__main__':
    df = sqlContext.read.parquet('/data/intermediate_data/cdr_step5_1/')
    df_test = sqlContext.read.parquet('/data/intermediate_data/cdr_step5/')
    label_indexer = StringIndexer(inputCol='churned', outputCol='label').fit(df)
    reduced_numeric_cols = [
        "coefficiant_of_variance_in", "coefficiant_of_variance_out",
        "call_count_in", "call_count_out"
    ]
    assembler = VectorAssembler(inputCols=reduced_numeric_cols, outputCol='features')
    # assembler.transform(df)  # redundant: the pipeline below applies the assembler
    # (train, test) = df_test.randomSplit([0.4, 0.6])
    classifier = RandomForestClassifier(labelCol='label', featuresCol='features')
    pipeline = Pipeline(stages=[label_indexer, assembler, classifier])
    model = pipeline.fit(df)
    predictions = model.transform(df_test)
    predictions.write.mode("overwrite").saveAsTable(
        "cdr_step6_1", format="parquet", path="/data/intermediate_data/cdr_step6_1/")
    # the evaluator needs the numeric label produced by the StringIndexer,
    # not the raw string column 'churned'
    evaluator = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="rawPrediction",
                                              metricName="areaUnderROC")
    auroc = evaluator.evaluate(predictions)
    print("Area under ROC = %g" % auroc)  # the original mislabeled this value "Precision"
evaluator = BinaryClassificationEvaluator()

# Train a decision tree with default parameters (including maxDepth=5)
dt_classifier_default = DecisionTreeClassifier(labelCol='label', featuresCol='TFIDF', maxDepth=5)

# Create an ML pipeline for the decision tree model
dt_pipeline_default = Pipeline(stages=[label_indexer, dt_classifier_default])

# Apply pipeline and train model
dt_model_default = dt_pipeline_default.fit(train_tfidf)

# Apply model on development data
dt_predictions_default_dev = dt_model_default.transform(dev_tfidf)

# Evaluate model using the AUC metric
auc_dt_default_dev = evaluator.evaluate(dt_predictions_default_dev,
                                        {evaluator.metricName: 'areaUnderROC'})

# Print result to standard output
print('Decision Tree, Default Parameters, Development Set, AUC: ' + str(auc_dt_default_dev))

# TODO: Check for signs of overfitting (by evaluating the model on the training set)
# [FIX ME!] Write code below (a possible sketch follows this block)

# TODO: Tune the decision tree model by changing one of its hyperparameters
# Build and evaluate decision trees with the following maxDepth values: 3 and 4.
# [FIX ME!] Write code below

# Train a random forest with default parameters (including numTrees=20)
rf_classifier_default = RandomForestClassifier(labelCol='label', featuresCol='TFIDF', numTrees=20)

# Create an ML pipeline for the random forest model
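# A possible sketch for the first TODO above (overfitting check): evaluate the
# already-fitted default tree on the training split it was fit on and compare
# AUCs. Assumes `train_tfidf` is the same DataFrame passed to fit() earlier.
dt_predictions_default_train = dt_model_default.transform(train_tfidf)
auc_dt_default_train = evaluator.evaluate(dt_predictions_default_train,
                                          {evaluator.metricName: 'areaUnderROC'})
print('Decision Tree, Default Parameters, Training Set, AUC: ' + str(auc_dt_default_train))
# A training AUC far above the development AUC would be a sign of overfitting.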
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(adulttrain)
lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()
bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000])\
    .addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))
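# A hedged follow-up sketch: inspect which grid point the cross-validation
# chose. avgMetrics and getEstimatorParamMaps() are standard members of the
# fitted CrossValidatorModel; `cvmodel` is the model fit just above.
best_idx = max(range(len(cvmodel.avgMetrics)), key=lambda i: cvmodel.avgMetrics[i])
print(cvmodel.getEstimatorParamMaps()[best_idx])
print("best average metric across folds:", cvmodel.avgMetrics[best_idx])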
# COMMAND ----------

# DBTITLE 1,3 Building the model
from pyspark.ml.classification import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
bc_model = dtc.fit(train)

# COMMAND ----------

# DBTITLE 1,4 Testing your model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = bc_model.transform(test)
# use the rawPrediction column: computing ROC from the hard 0/1 predictions
# (as the original did) collapses the curve to a single threshold
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="label",
                                          metricName='areaUnderROC')
areaUnderROC = evaluator.evaluate(predictions)
accuracy = predictions.filter("label=prediction").count() / test.count()
print(areaUnderROC, accuracy)

# COMMAND ----------

# DBTITLE 1,5 Improving the model
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder().addGrid(dtc.maxDepth, [1, 3, 5]).addGrid(
    dtc.maxBins, [2, 32]).build()
crossval = CrossValidator(estimator=dtc,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)
user_rf_param_numFolds = 2

# Settings for Random Forest - parameter grid search
rf_paramGrid = ParamGridBuilder().addGrid(rfclassifier.numTrees, user_rf_param_numTreeSet)\
    .addGrid(rfclassifier.maxDepth, user_rf_param_maxDepthSet)\
    .addGrid(rfclassifier.impurity, user_rf_param_impuritySet).build()

evaluator = BinaryClassificationEvaluator()
multiEvaluator = MulticlassClassificationEvaluator()

# Setting parameters for cross-validation
rf_cv = CrossValidator(estimator=pipeline, evaluator=evaluator,
                       estimatorParamMaps=rf_paramGrid, numFolds=user_rf_param_numFolds)
rf_cvmodel = rf_cv.fit(train)

# Evaluating Random Forest model performance
from pyspark.sql.functions import udf
rf_predictions = rf_cvmodel.transform(test)
auroc = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderPR"})
# the original left these as bare string expressions, which print nothing in a script
print("The AUROC is %s and the AUPR is %s" % (auroc, aupr))

f1score = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "f1"})
weightedPrecision = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "weightedPrecision"})
weightedRecall = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "weightedRecall"})
print("The F1 score: %s the Weighted Precision: %s the Weighted Recall is %s"
      % (f1score, weightedPrecision, weightedRecall))

# Select the best Random Forest model after cross-validation
rfmodel = rf_cvmodel.bestModel
bestRFModel = rfmodel.stages[-1]

# Retrieving parameters from the best RF model
param_BestModel_NumTrees = bestRFModel._java_obj.getNumTrees()
                                        rawPredictionCol='prediction',
                                        metricName='areaUnderROC')

# In[71]:

# generate splits for cross validation
splits = indexedData.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2])

# In[72]:

TotalAccuracy = 0
for i in range(5):
    testIndex = splits[i].select('id').collect()  # get test ids for each fold
    rdd = sc.parallelize(testIndex)
    test_rdd = rdd.flatMap(lambda x: x).collect()
    test_Data = indexedData.filter(
        indexedData.id.isin(test_rdd))  # get test data for each fold
    train_Data = indexedData.filter(
        ~indexedData.id.isin(test_rdd))  # get train data for each fold
    model = nb.fit(train_Data)  # fit train data to model
    transformed_data = model.transform(test_Data)  # evaluate test data
    # note: this is the evaluator's metric (areaUnderROC), not classification
    # accuracy, despite the variable names
    accuracy = binaryEvaluator.evaluate(transformed_data)
    print(binaryEvaluator.getMetricName(), ':', accuracy)
    TotalAccuracy = TotalAccuracy + accuracy

averageAccuracy = TotalAccuracy / 5  # average metric across the 5 folds
print(averageAccuracy)
from pyspark.ml import Pipeline
#pipeline_lr = Pipeline().setStages((assembler, lr))

log_model = lr.fit(trainingData)
pred = log_model.transform(testingData)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(log_model.coefficients))
print("Intercept: " + str(log_model.intercept))

# Computing a range of metrics for each of the algorithms
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                                 labelCol="label", metricName="areaUnderROC")
auroc = binary_evaluator.evaluate(pred)
print('auroc: {}'.format(auroc))  # auroc: 0.671

pred_sub = pred.select("label", "prediction")
tn = pred_sub.filter((pred_sub.label == 0) & (pred_sub.prediction == 0)).count()
tp = pred_sub.filter((pred_sub.label == 1) & (pred_sub.prediction == 1)).count()
fp = pred_sub.filter((pred_sub.label == 0) & (pred_sub.prediction == 1)).count()
fn = pred_sub.filter((pred_sub.label == 1) & (pred_sub.prediction == 0)).count()

precision = tp / (tp + fp)
accuracy = (tp + tn) / (tn + tp + fp + fn)  # 0.89
recall = tp / (tp + fn)
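# A hedged follow-up sketch: the confusion-matrix counts above combine into F1
# as the harmonic mean of precision and recall; the zero-denominator guard is
# an addition, not part of the original.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
print('precision: {:.3f}, recall: {:.3f}, f1: {:.3f}'.format(precision, recall, f1))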
def testModel(model, validate=validate):
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)
def evaluate(preds):
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
    return evaluator.evaluate(preds)
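# Minimal usage sketch for the helper above, assuming a fitted binary
# classifier `clf_model` and a test DataFrame `test_df` with "label" and
# "features" columns (both names are hypothetical):
# print("AUC:", evaluate(clf_model.transform(test_df)))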
y_pred.filter(y_pred.label == y_pred.prediction).count() / y_pred.count()

# note: pointing the evaluator at the hard "prediction" column yields a
# one-threshold ROC; "rawPrediction" would give the proper curve
evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="prediction",
                                          metricName='areaUnderROC')
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

acc = evaluatorMulti.evaluate(y_pred, {evaluatorMulti.metricName: "accuracy"})
precision = evaluatorMulti.evaluate(y_pred, {evaluatorMulti.metricName: "precisionByLabel"})
recall = evaluatorMulti.evaluate(y_pred, {evaluatorMulti.metricName: "recallByLabel"})
f1 = evaluatorMulti.evaluate(y_pred, {evaluatorMulti.metricName: "f1"})
roc_auc = evaluator.evaluate(y_pred)

print("accuracy: %f, precision: %f, recall: %f, f1: %f, roc_auc: %f"
      % (acc, precision, recall, f1, roc_auc))

##################################################
# GBM
##################################################

gbm = GBTClassifier(maxIter=100, featuresCol="features", labelCol="label")
gbm_model = gbm.fit(train_df)
y_pred = gbm_model.transform(test_df)
y_pred.show(5)
y_pred.filter(y_pred.label == y_pred.prediction).count() / y_pred.count()
new_data = assembler.transform(data)
final_data = new_data.select('features', 'shares')

from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result', 'features')

from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250, labelCol='result', featuresCol='features')

train_data, test_data = finalData.randomSplit([0.7, 0.3])
rfc_model = rfc.fit(train_data)
result = rfc_model.transform(test_data)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))

test_data.head(1)

# import os, sys
# import pandas
# import plotly.plotly as py
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import cufflinks as cf
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# sys.path.append("".join([os.environ["HOME"]]))
# result.columns
# predictions_pdf = result.select('result', 'features', 'rawPrediction', 'probability', 'prediction').toPandas()
# cumulative_stats = predictions_pdf.groupby(['prediction']).count()
# product_data = [go.Pie(labels=cumulative_stats.indexGENDER, values=cumulative_stats['features'])]
pred_and_labels = fitted_churn_model.evaluate(test_churn)

# In[42]:

pred_and_labels.predictions.show()

# ### Using AUC

# In[24]:

churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                           labelCol='churn')

# In[26]:

auc = churn_eval.evaluate(pred_and_labels.predictions)

# In[43]:

auc

# [Common question - what is a good AUC value?](https://stats.stackexchange.com/questions/113326/what-is-a-good-auc-for-a-precision-recall-curve)

# ### Predict on brand new unlabeled data
#
# We still need to evaluate the new_customers.csv file!

# In[28]:

final_lr_model = lr_churn.fit(final_data)
evaluation = dict()
evaluation["metrics"] = dict()
threshold = {'mid_value': 0.7, 'min_value': 0.3, 'metric': 'accuracyScore'}

# replace "label" below with the numeric representation of the label column
# that you defined while training the model
labelCol = "label"

# create evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol=labelCol)

# compute evaluations
evaluation["metrics"]["accuracyScore"] = predictions.rdd.filter(
    lambda x: x[labelCol] == x["prediction"]).count() * 1.0 / predictions.count()
evaluation["metrics"]["areaUnderPR"] = evaluator.evaluate(
    predictions, {evaluator.metricName: "areaUnderPR"})
evaluation["metrics"]["areaUnderROC"] = evaluator.evaluate(
    predictions, {evaluator.metricName: "areaUnderROC"})
evaluation["metrics"]["threshold"] = threshold

if (evaluation["metrics"][threshold.get('metric', 'INVALID_METRIC')] >=
        threshold.get('mid_value', 0.70)):
    evaluation["performance"] = "good"
elif (evaluation["metrics"][threshold.get('metric', 'INVALID_METRIC')] <=
      threshold.get('min_value', 0.25)):
    evaluation["performance"] = "poor"
else:
    evaluation["performance"] = "fair"

evaluation["modelName"] = "Customer_churn_CHAID_Modeler"
evaluation["startTime"] = int(time.time())
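# A hedged sketch of inspecting the assembled record: dumping it as JSON for
# logging or persistence (json is stdlib; this step is an assumption, not part
# of the original scoring flow).
import json
print(json.dumps(evaluation, indent=2, default=str))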
def evaluate_roc_auc(predictions, sqlc):
    raw = scores_and_labels(predictions, sqlc)
    evaluator = BinaryClassificationEvaluator()
    return evaluator.evaluate(raw)
# Generate predictions on the test DataFrame:
test_with_prediction = log_reg_model.transform(df_test)
# test_with_prediction.show(5)
test_with_prediction.select("Class", "rawPrediction", "probability", "prediction").show(5)

# **Note:** The resulting DataFrame includes three types of predictions. The
# `rawPrediction` is a vector of log-odds, `probability` is a vector of
# probabilities, and `prediction` is the predicted class based on the
# probability vector.

# Create an instance of `BinaryClassificationEvaluator` class:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="Class",
                                          metricName="areaUnderROC")
print(evaluator.explainParams())
evaluator.evaluate(test_with_prediction)

# Evaluate using another metric:
evaluator.setMetricName("areaUnderPR").evaluate(test_with_prediction)

# ## Score out a new dataset

# There are two ways to score out a new dataset.

# **Method 1:** The `evaluate` method

# The more expensive way is to use the `evaluate` method of the
# `LogisticRegressionModel` class. The `predictions` attribute of the
# resulting `BinaryLogisticRegressionSummary` instance contains the scored
# DataFrame:
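# A hedged sketch of that call, assuming `log_reg_model` and `df_test` from
# above; `evaluate()` returning a summary whose `predictions` attribute holds
# the scored DataFrame is the standard LogisticRegressionModel API:
test_summary = log_reg_model.evaluate(df_test)
test_summary.predictions.printSchema()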
trainDF = sqlCt.read.parquet("20news_train.parquet")

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training data.
model = pipeline.fit(trainDF)

# Evaluate the model on testing data
testDF = sqlCt.read.parquet("20news_test.parquet")
prediction = model.transform(testDF)
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(prediction))

'''sbaronia - setting up parameters using ParamGridBuilder
with 3 different feature counts and 9 different regParam values'''
param_Grid = (ParamGridBuilder()
              .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])
              .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
              .build())

'''sbaronia - creating a new CrossValidator that will use the
above parameters and the same evaluator with 2-fold cross validation'''
cross_val = (CrossValidator()
             .setEstimator(pipeline)
             .setEvaluator(evaluator)
assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])

### Randomly split data into training and test sets. Set seed for reproducibility.
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

#dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt = LogisticRegression(regParam=0.01)
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)
# Evaluator is abstract; instantiate the concrete binary-classification evaluator
evaluator = BinaryClassificationEvaluator()

# Select example rows to display.
predictions.select("prediction", "label", "features").show()

# Evaluate the learned model
print("LogRegression Test %s: %f" % (evaluator.getMetricName(),
                                     evaluator.evaluate(predictions)))

nb = NaiveBayes(thresholds=[0.1, 1.0])
model = nb.fit(trainingData)  # fixed: the original refit `dt`, so the "Bayes" numbers were really logistic regression
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()
print("Bayes Test %s: %f" % (evaluator.getMetricName(),
                             evaluator.evaluate(predictions)))
#lrModel = lr.fit(trainingData)

# build the pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx, lr])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(trainingData)
predictions = pipelineFit.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("SentimentText", "Sentiment", "probability", "label", "prediction") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)

predictions.filter(predictions['prediction'] == 1) \
    .select("SentimentText", "Sentiment", "probability", "label", "prediction") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)

# Evaluate. BinaryClassificationEvaluator supports areaUnderROC (default) and
# areaUnderPR only, so the value printed below is AUC, not F1 as the original
# comment and label claimed.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label")
print("AUC: %g" % (evaluator.evaluate(predictions)))

# save the trained model for future use
pipelineFit.save("logreg.model1")
# PipelineModel.load("logreg.model")
        VectorAssembler(
            inputCols=["{0}_counts".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    nb = [NaiveBayes(smoothing=1.0, modelType="multinomial")]
    return Pipeline(stages=tokenizer + ngrams + vectorizers + assembler + nb)

model = build_ngrams(n=2).fit(train_data)
preds_valid = model.transform(valid_data)

# Evaluate the model. Default metric: area under ROC..... areaUnderROC: 0.609
# with text_clean: 0.607
# with text_clean + build_ngrams(n=2): 0.612
bceval = BinaryClassificationEvaluator()
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid), 3)))

# Evaluate the model. Metric: area under PR...... areaUnderPR: 0.732
# with text_clean: 0.728
# with text_clean + build_ngrams(n=2): 0.729
bceval.setMetricName("areaUnderPR")
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid), 3)))

# Evaluate the model. Metric: F1 score...... f1: 0.865
# with text_clean: 0.858
# with text_clean + build_ngrams(n=2): 0.882
mceval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                           metricName="f1")
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
# One-hot encoding
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

# EMBARKED variable
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'EmbarkVec', 'Age',
                                       'SibSp', 'Parch', 'Fare'],
                            outputCol='features')

model = LogisticRegression(featuresCol='features', labelCol='Survived')

# Create a pipeline (series of stages)
pipeline = Pipeline(stages=[gender_indexer, gender_encoder,
                            embark_indexer, embark_encoder,
                            assembler, model])

train_data, test_data = my_final_data.randomSplit([0.7, 0.3])
fitted_model = pipeline.fit(train_data)

# Evaluate on test dataset
results = fitted_model.transform(test_data)
results.show()

# "eval" renamed so the builtin is not shadowed; note that evaluating on the
# hard 'prediction' column gives only a single-threshold approximation of AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')
auc = evaluator.evaluate(results)
pipeline = Pipeline(stages=[rf])
pipelineModel = pipeline.fit(training)
#trainingPredictions = pipelineModel.transform(training)
#trainingPredictions.show()
#trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)

#evaluator = MulticlassClassificationEvaluator(
#    labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()

from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
#aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)

# The grid multiplies out to (2 x 3) x 3 folds = 18 model fits
# (the original comment's "180" did not match this grid).
# k = 3 and k = 10 are common fold counts.
from pyspark.ml.tuning import *
paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini'])\
                              .addGrid(rf.numTrees, [10, 30, 50]).build()
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator)\
                     .setEstimatorParamMaps(paramGrid).setNumFolds(3)

# Run cross-validation, and choose the best set of parameters.
cvModel = cv.fit(training)
cvPredictions = cvModel.transform(test)
cvAUCTest = evaluator.evaluate(cvPredictions, evaluatorParaMap)
cvPredictions.show()
# print("pipeline Training AUC: " + str(aucTraining))
predictions = out[0].data_frame
threshold = {'min_value': 0.3, 'metric': 'areaUnderROC', 'mid_value': 0.7}

# replace "label" below with the numeric representation of
# the label column that you defined while training the model
labelCol = "label"

# create evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol=labelCol)

# compute evaluations
eval_fields = {
    "accuracyScore": predictions.rdd.filter(
        lambda x: x[labelCol] == x["prediction"]).count() * 1.0 / predictions.count(),
    "areaUnderPR": evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}),
    "areaUnderROC": evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"}),
    "thresholdMetric": threshold["metric"],
    "thresholdMinValue": threshold["min_value"],
    "thresholdMidValue": threshold["mid_value"]
}

# feel free to customize to your own performance logic using the values of "good", "poor", and "fair".
if eval_fields[eval_fields["thresholdMetric"]] >= threshold.get('mid_value', 0.70):
    eval_fields["performance"] = "good"
elif eval_fields[eval_fields["thresholdMetric"]] <= threshold.get('min_value', 0.25):
    eval_fields["performance"] = "poor"
else:
    eval_fields["performance"] = "fair"

save_evaluation_metrics(eval_fields, "Breast Cancer Automated RF2", "1", startTime)
def main(argv):
    start = time.time()

    # INGEST DATA INTO DATA FRAME OR TEMP. TABLE
    print("Ingest data...")
    sc = SparkContext(appName="KaggleDato")
    sqlContext = SQLContext(sc)
    train_label_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_TRAIN_LABELS)
    input_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_JSON)
    #input_df.printSchema()
    #train_label_df.printSchema()
    #input_df.show()
    #print(input_df.count())

    # Make DF with labels
    train_wlabels_df = input_df.join(train_label_df, "id")
    train_wlabels_df.repartition("label")
    train_wlabels_df.explain()  # fixed: the original referenced .explain without calling it
    #train_wlabels_df.printSchema()

    # train/CV split, stratified sampling; 1 is the under-represented class
    fractions = {1.0: 1.0, 0.0: 0.15}
    stratified = train_wlabels_df.sampleBy("label", fractions, 36)
    # split the stratified sample (the original split train_wlabels_df and left
    # the sampleBy result unused)
    train, cv = stratified.randomSplit([0.7, 0.3])

    print("Prepare text features...")
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    #tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    #tokenized_df = tokenizer.transform(train_wlabels_df)
    #tokenized_df.show()

    # remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    #filtered_df = remover.transform(tokenized_df)
    #filtered_df.printSchema()
    #filtered_df.show()

    # try ngrams instead
    #ngram = NGram(n=2, inputCol="filtered", outputCol="filtered")
    #ngram_df = ngram.transform(tokenized_df_copy)

    # Hashing
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    #featurized_df = hashingTF.transform(filtered_df)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    #idfModel = idf.fit(featurized_df)
    #rescaled_df = idfModel.transform(featurized_df)
    #rescaled_df.printSchema()

    # Trying various classifiers here
    # create a pipeline
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])
    # Train a RandomForest model.
    #rf = RandomForestClassifier(numTrees=10, impurity="gini", maxDepth=4, maxBins=32)
    #pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, rf])

    # Parameter search grid
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 20, 30]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    # Note that the evaluator here is a BinaryClassificationEvaluator and its
    # default metric is areaUnderROC.
    # metricName options are: areaUnderROC | areaUnderPR
    ev = BinaryClassificationEvaluator(metricName="areaUnderROC")
    # Alternative: use a multiclass classification evaluator
    # metricName options are f1, precision, recall
    #ev = MulticlassClassificationEvaluator(metricName="f1")

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=ev,
                              numFolds=2)  # use 3+ folds in practice

    # below is the single-pipeline vs parameter-search switch
    # Fit the pipeline to training documents.
    model = pipeline.fit(train)
    #model = crossval.fit(train)

    print("Evaluate model on test instances...")
    prediction = model.transform(cv)
    prediction.select("id", "text", "probability", "prediction").show(5)
    # ev's metric is areaUnderROC, so this is AUC rather than accuracy;
    # the original printed 1 - AUC and called it "CV Error"
    auc = ev.evaluate(prediction)
    print("CV AUC = " + str(auc))
"SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges" ], outputCol="features") featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4) (train, test) = df_proper.randomSplit([0.7, 0.3]) classifier = RandomForestClassifier(labelCol='label', featuresCol='features') pipeline = Pipeline(stages=[labelIndexer, assembler, classifier]) model = pipeline.fit(train) predictions = model.transform(test) evaluator = BinaryClassificationEvaluator() auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"}) test = int(auroc) print auroc f = open(sys.argv[2], 'w') f.write("the area under curve for Random Forest Classifier is: " + str(auroc)) f.close()
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [300, 400])\
                              .addGrid(lr.regParam, [0.01, 0.1, 1.0]).build()

# Set up cross-validation.
cv = CrossValidator().setNumFolds(3).setEstimator(pipeline)\
                     .setEstimatorParamMaps(paramGrid)\
                     .setEvaluator(BinaryClassificationEvaluator())

# Fit a model with cross-validation.
cvModel = cv.fit(trainingData)

testTransform = cvModel.transform(testData)
predictions = testTransform.select("review", "prediction", "label")
# DataFrame.map was removed in Spark 2.x; go through .rdd instead
predictionsAndLabels = predictions.rdd.map(lambda x: (x[1], x[2]))
# this is the error rate on the *test* data, despite the original "TrainErr" label
testErr = predictionsAndLabels.filter(lambda r: r[0] != r[1]).count() / float(testData.count())
print("TestErr: " + str(testErr))

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(testTransform, {evaluator.metricName: "areaUnderPR"})
evaluator.evaluate(testTransform, {evaluator.metricName: "areaUnderROC"})
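# A hedged DataFrame-native alternative to the RDD-based error count above,
# assuming `testTransform` from the cross-validated model; this avoids the
# round-trip through the RDD API:
test_err_df = testTransform.filter("prediction != label").count() / float(testTransform.count())
print("TestErr (DataFrame API): " + str(test_err_df))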
# out = encoder.fit(out).transform(out)
# out.show()
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager',
                                       'Years', 'Num_Sites'],
                            outputCol='features')
out = assembler.transform(data)

train_data, test_data = out.randomSplit([0.7, 0.3])
lg_model = LogisticRegression(featuresCol='features', labelCol='Churn')
model = lg_model.fit(train_data)
results = model.evaluate(test_data)
# results.predictions.show()

# "eval" renamed so the builtin is not shadowed
evaluator = BinaryClassificationEvaluator(labelCol='Churn', rawPredictionCol='prediction')
auc = evaluator.evaluate(results.predictions)
print(auc)

final_model = lg_model.fit(out)
new_df = spark.read.csv("./files/new_customers.csv", inferSchema=True, header=True)
new_customers = assembler.transform(new_df)
final_res = final_model.transform(new_customers)
final_res.select(['Company', 'prediction']).show()
trainingPredictions = pipelineModel.transform(training)
#trainingPredictions.show()
trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)

#evaluator = MulticlassClassificationEvaluator(
#    labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()

from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)
print("pipeline Test AUC: %g" % aucTest)

from pyspark.ml.tuning import *

# The grid below multiplies out to (3 x 2 x 3) x 3 folds = 54 model fits
# (the original comment's "180" did not match this grid).
# k = 3 and k = 10 are common fold counts.
#paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini']).addGrid(rf.numTrees, [30, 50, 100]).build()
# among [10, 50, 100], 50 trees scored highest
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [10, 20, 30])\
                              .addGrid(rf.impurity, ['entropy', 'gini'])\
                              .addGrid(rf.numTrees, [30, 50, 100]).build()
# ============ above: without CV; below: with CV ============
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator)\
                     .setEstimatorParamMaps(paramGrid).setNumFolds(3)
# Run cross-validation, and choose the best set of parameters.
                                      udf_strpTime_trainlabel(df_train_label['date'])).drop('date')
df_new = df_train_label.join(df_features, 'realdate')
df_new = df_new.na.fill(0.0)
train, test = df_new.randomSplit([0.80, 0.20])

assembler = VectorAssembler(
    inputCols=['realdate', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9',
               'e10', 'e11', 'e12', 'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19',
               'e20', 'e21', 'e22', 'e23', 'e24', 'e25', 'e26'],
    outputCol='features')
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[assembler, lr])
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
    .build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)
model = crossval.fit(train)
prediction = model.transform(test)
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(prediction))
# build labeled points from data
data_class = zip(data, Y)  # if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2,'b'),(3,'c')]
dcRDD = sc.parallelize(data_class, numSlices=16)
# get the labeled points
labeledRDD = dcRDD.map(partial(createBinaryLabeledPoint, dictionary=dict_broad.value))

#****************************************************************
#*********************CROSS VALIDATION: 80%/20%******************
#*******************Model: logistic regression******************
#****************************************************************
# create a DataFrame from an RDD -> features must be Vectors.sparse from pyspark.mllib.linalg
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(labeledRDD, ['features', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
dfTrain.show()

# choose estimator and grid
lr = LogisticRegression()  # choose the model
# the grid searches over maxIter; note that regularization would instead be
# tuned through elasticNetParam (0 for L2, 1 for L1), which this grid does not touch
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()

print("Start Cross validation")
evaluator = BinaryClassificationEvaluator()  # choose the evaluator
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
# perform the cross validation and keep the best value of maxIter
cvModel = cv.fit(dfTrain)  # train the model on the whole training set
# the evaluator's default metric is areaUnderROC, not the share of correct labels
resultat = evaluator.evaluate(cvModel.transform(dfTest))
print("Area under ROC on the test set (0-1): ", resultat)
inputCols=categorical, outputCol="features") hasher.transform(df_train).select("features").show() from pyspark.ml.classification import LogisticRegression classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000) stages = [hasher, classifier] from pyspark.ml import Pipeline pipeline = Pipeline(stages=stages) model = pipeline.fit(df_train) predictions = model.transform(df_test) predictions.cache() from pyspark.ml.evaluation import BinaryClassificationEvaluator ev = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName="areaUnderROC") print(ev.evaluate(predictions)) spark.stop()
d6.groupBy("label").count().show(truncate=False) dataArr = d6.randomSplit([0.7, 0.3]) train = dataArr[0] test = dataArr[1] indexer = StringIndexer(inputCol="road", outputCol="roadcode") assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"], outputCol="features") dt = DecisionTreeClassifier(labelCol="label", featuresCol="features") pipeline = Pipeline(stages=[indexer, assembler, dt]) model = pipeline.fit(train) predict = model.transform(test) predict.select("label", "probability", "prediction").show(3, False) # areaUnderROC, areaUnderPR evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC") print(evaluator.evaluate(predict)) treeModel = model.stages[2] print("Learned classification tree model:%s" % treeModel.toDebugString) spark.stop
def main(argv):
    start = time.time()

    # INGEST DATA INTO DATA FRAME OR TEMP. TABLE
    print("Ingest data...")
    sc = SparkContext(appName="KaggleDato")
    sqlContext = SQLContext(sc)
    train_label_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_TRAIN_LABELS)
    input_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_JSON)
    #input_df.printSchema()
    #train_label_df.printSchema()
    #input_df.show()

    # Make DF with labels
    train_wlabels_df = input_df.join(train_label_df, "id")

    # train/CV split, stratified sampling; 1 is the under-represented class
    fractions = {1.0: 1.0, 0.0: 1.0}
    stratified = train_wlabels_df.sampleBy("label", fractions, 36)
    stratified = stratified.repartition(200)
    train, cv = stratified.randomSplit([0.7, 0.3])

    print("Prepare text features...")
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and the classifier.
    #tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    # remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    #filtered_df = remover.transform(tokenized_df)
    #filtered_df.printSchema()
    #filtered_df.show()
    # try ngrams instead
    #ngram = NGram(n=2, inputCol="filtered", outputCol="filtered")
    #ngram_df = ngram.transform(tokenized_df_copy)
    # Hashing
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    # Trying various classifiers here
    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 2 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                                numTrees=10, impurity="gini", maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, labelIndexer, featureIndexer, rf])

    # Note that the evaluator here is a BinaryClassificationEvaluator and its
    # default metric is areaUnderROC.
    # metricName options are: areaUnderROC | areaUnderPR
    metricName = "areaUnderPR"
    ev = BinaryClassificationEvaluator(metricName=metricName)
    # Alternative: use a multiclass classification evaluator
    # metricName options are f1, precision, recall
    #ev = MulticlassClassificationEvaluator(metricName="f1")

    # Fit the pipeline to training documents.
    model = pipeline.fit(train)

    print("Evaluate model on test instances...")
    prediction = model.transform(cv)
    #prediction = labelConverter.transform(prediction)
    prediction.select("label", "text", "probability", "prediction").show(100)
    result = ev.evaluate(prediction)
    print(metricName, ": ", result)
    # this counts matching labels, so it is the CV *accuracy*;
    # the original printed it under the misleading name "CV Error"
    cvAccuracy = prediction.filter(prediction.label == prediction.prediction).count() / float(cv.count())
    print('CV Accuracy = ' + str(cvAccuracy))
print(testData.count())

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="delayed", regParam=0.01)

# COMMAND ----------

lrModel = lr.fit(trainingData)

# COMMAND ----------

results = lrModel.transform(testData)

# COMMAND ----------

display(results)

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="delayed")
print(evaluator.evaluate(results))

# COMMAND ----------

lrModel.save("s3a://dbc-mwc/ml_models/flight_delays_lr/")
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
# tuple-parameter unpacking in lambdas is Python 2-only; index the pair instead
rows = lines.zipWithIndex().filter(lambda row_index: row_index[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]),
                                 sentence=review_to_wordlist(p[2])))
reviewDF = sqlContext.createDataFrame(review)
transformDF = model.transform(reviewDF)
selectData = transformDF.select("label", "features")

(trainingData, testData) = selectData.randomSplit([0.6, 0.4])
lr = LogisticRegression(maxIter=5, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData)

# (the original also unpersisted u_lines/u_rows/u_parts/u_review,
# which are never defined in this script and would raise NameError)
lines.unpersist()
rows.unpersist()

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(result, {evaluator.metricName: "areaUnderROC"})
evaluator.evaluate(result, {evaluator.metricName: "areaUnderPR"})
best_model = cv.bestModel  # `cv` here must be the fitted CrossValidatorModel, not the CrossValidator

# Look at the stages in the best model
print("Best model:", best_model)

# Get the parameters for the RandomForestClassifier in the best model
# (the original comment said LinearRegression, which does not match the code)
print("Best model RandomForestClassifier ParamMap:")
for k, v in best_model.extractParamMap().items():
    print(" ", k.name, "=", v)

# Average AUC for each parameter combination in the grid
# (the evaluator is a classifier evaluator, so these are AUCs, not RMSEs)
avg_auc = cv.avgMetrics
print("Average AUC across all folds:", avg_auc)

# Average AUC for the best model
best_model_auc = max(cv.avgMetrics)

# What's the optimal parameter value?
opt_max_depth = cv.bestModel.explainParam('maxDepth')
opt_feat_substrat = cv.bestModel.explainParam('featureSubsetStrategy')

# AUC for the best model on the testing data
best_auc = evaluator.evaluate(cv.transform(flights_test))
print("best test AUC: %f" % best_auc)
bst_model_path = model_save_path + "_bst_model"
train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)
bst_model = train_with_tune(train_df)
bst_model.write().overwrite().save(bst_model_path)

# Use the best model from tuning to predict on the test data.
# Each prediction row has roughly this structure:
#   features = Vectors.dense(...)
#   label=0,
#   rawPrediction=DenseVector([0.048, -0.048]),
#   probability=DenseVector([0.512, 0.488]),
#   prediction=0.0
loaded_bst_model = PipelineModel.load(bst_model_path)
result = loaded_bst_model.transform(train_df)  # fixed: was `loaded_model`, which is undefined
predict_result = loaded_bst_model.transform(test_df)
print("predicted sample :", predict_result.take(3))

# Evaluate the trained binary classification model
bin_eval = BinaryClassificationEvaluator()
predict_metric = bin_eval.evaluate(predict_result, {bin_eval.metricName: "areaUnderROC"})
print("trained model test auc metric", predict_metric)

# Inspect the detailed classification metrics; f1 is computed by default
mm = MulticlassClassificationEvaluator()
f1 = mm.evaluate(predict_result)
accuracy = mm.evaluate(predict_result, {mm.metricName: "accuracy"})
precision = mm.evaluate(predict_result, {mm.metricName: "weightedPrecision"})
recall = mm.evaluate(predict_result, {mm.metricName: "weightedRecall"})
print("predict trained model precision: %f, recall: %f, acc: %s, f1: %f"
      % (precision, recall, accuracy, f1))
# In[21]:

fit_model = pipeline.fit(train_titanic_data)

# In[22]:

results = fit_model.transform(test_titanic_data)

# In[23]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# In[24]:

# note: pointing the evaluator at the hard 'prediction' column approximates
# AUC from a single threshold; 'rawPrediction' would give the full curve
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                        labelCol='Survived')

# In[26]:

results.select('Survived', 'prediction').show()

# In[27]:

AUC = my_eval.evaluate(results)

# In[28]:

AUC

# ## Great Job!
# View the model's predictions and probabilities of each prediction class
selected = predictions.select("label", "prediction", "probability")
display(selected)

# COMMAND ----------

# MAGIC %md
# MAGIC We can make use of the BinaryClassificationEvaluator method to evaluate our model. The Evaluator expects two input columns: (rawPrediction, label).

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC

# COMMAND ----------

evaluator.getMetricName()

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").

# COMMAND ----------
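# A minimal sketch of the cell the markdown above describes: switch the same
# evaluator to areaUnderPR and re-evaluate (assumes `predictions` from the
# earlier cells).
evaluator.setMetricName("areaUnderPR")
evaluator.evaluate(predictions)

# COMMAND ----------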
    SetLogger(sc)
    return (sc)

sc = CreateSparkContext()

print("read data")
sqlContext = SQLContext(sc)
row_df = sqlContext.read.format("csv").option("header", "true")\
                   .option("delimiter", "\t").load(Path + "data/train.csv")
df = row_df.select(['url', 'alchemy_category'] +
                   [col(column).cast("double").alias(column)
                    for column in row_df.columns[4:]])
train_df, test_df = df.randomSplit([0.7, 0.3])
train_df.cache()
test_df.cache()

print("setup pipeline")
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                            impurity="gini", maxDepth=10, maxBins=14)
stringIndexer = StringIndexer(inputCol='alchemy_category', outputCol="alchemy_category_Index")
encoder = OneHotEncoder(dropLast=False, inputCol='alchemy_category_Index',
                        outputCol="alchemy_category_IndexVec")
assemblerInputs = ['alchemy_category_IndexVec'] + row_df.columns[4:-1]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, dt])

print("train model")
pipelineModel = pipeline.fit(train_df)

print("predict")
# show() returns None, so keep the DataFrame and display it separately
# (the original assigned show()'s result and then printed None)
predicted = pipelineModel.transform(test_df).select('url', 'prediction')
predicted.show(10)

print("eval model")
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
predictions = pipelineModel.transform(test_df)
auc = evaluator.evaluate(predictions)
print(auc)
# You can select any columns in the above schema to view as well. For example's sake we will choose age & occupation
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

# COMMAND ----------

# MAGIC %md
# MAGIC We can make use of the BinaryClassificationEvaluator method to evaluate our model. The Evaluator expects two input columns: (rawPrediction, label).

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC

# COMMAND ----------

evaluator.getMetricName()

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").

# COMMAND ----------
train, test = raw_data.randomSplit([0.70, 0.30])

numHosp = train.filter(train["TIPO PACIENTE"] == "HOSPITALIZADO").count()
numAmb = train.filter(train["TIPO PACIENTE"] == "AMBULATORIO").count()
BalancingRatio = numAmb / (numHosp + numAmb)

train = train.withColumn(
    "classWeights",
    when(train.label == 1, BalancingRatio).otherwise(1 - BalancingRatio))

model = modeloLogistico(data=train, labelCol="label",
                        featuresCol="features", weightCol="classWeights")
modelSummary = model.summary
predictions = predictLogistico(test, model)

evaluator = BinaryClassificationEvaluator()
print("################ MODEL EVALUATION ################")
print('TRAINING-SET AUROC: ' + str(modelSummary.areaUnderROC))
print('TEST-SET AUROC: ', evaluator.evaluate(predictions))
print("CLASSES:", modelSummary.labels)
print("F-MEASURE:", modelSummary.fMeasureByLabel(beta=1.0))
print("FALSE-POSITIVE RATE:", modelSummary.falsePositiveRateByLabel)
print("PRECISION: ", modelSummary.precisionByLabel)
print("RECALL: ", modelSummary.recallByLabel)
print("CONFUSION TABLE: ")
predictions.crosstab("label", "prediction").show()  # show() prints; wrapping it in print() would add a stray "None"
FP = preds.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP) / (TN + TP + FN + FP)

# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(preds, {multi_evaluator.metricName: "weightedPrecision"})

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(preds, {binary_evaluator.metricName: "areaUnderROC"})

# Create a new DataFrame with the metrics for this model
results_inf = spark.createDataFrame(
    data=[(str(models[x]), auc, accuracy, weighted_precision, precision, recall)],
    schema=["Model", "AUC", "Accuracy", "Weighted_Precision", "Precision", "Recall"])

# Append all results into one DataFrame
results = results.union(results_inf)
results.show()

# COMMAND ----------

# Get predictors
train_df = df_combinded[~df_raw_combined['target'].isnull()]
test_df = df_combinded[df_raw_combined['target'].isnull()]
train_df_sample = train_df.sample(5000, random_state=0)
target_train = train_df_sample['target']
train_data = train_df_sample.drop(['ID'], axis=1)

train_data = sqlContext.createDataFrame(train_data, list(train_data.columns))
assembler = VectorAssembler(inputCols=list(train_data.columns), outputCol='features')
train_data = assembler.transform(train_data)

lr = LogisticRegression(labelCol="target")
model = lr.fit(train_data)
prediction = model.transform(train_data)

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="target")
print("ROC score: {}".format(evaluator.evaluate(prediction)))

# collect the positive-class probabilities; iterating a Column object
# (as the original `list(prediction.probability)` did) does not work
probs = [float(row.probability[1]) for row in prediction.select("probability").collect()]
log_loss = metrics.log_loss(target_train, probs)
print("log loss: {}".format(log_loss))