def testClassification(data):
    # Train a RandomForest model.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)
    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF \
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print(row)

    # extract (positive-class probability, indexed label) pairs
    scoresAndLabels = predictionDF.rdd \
        .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel', metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
def build_randomForest(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10]) \
        .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)  # fit the cross-validator, not the bare estimator

    prediction = cvModel.transform(df)
    prediction.show()
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
def build_decisionTree(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
def evaluate(predictions, spark_metrics):
    from sklearn import metrics  # scikit-learn metrics

    # using sklearn metrics
    y_hat = predictions.rdd.map(lambda p: p.prediction).collect()
    y_true = predictions.rdd.map(lambda p: p.label).collect()
    print(metrics.classification_report(y_true, y_hat))
    print('AUC score: %f' % metrics.roc_auc_score(y_true, y_hat))
    print("Accuracy: %f" % metrics.accuracy_score(y_true, y_hat))

    # using spark metrics
    result = []
    for metric in spark_metrics:
        ev = BinaryClassificationEvaluator().setMetricName(metric)  # avoid shadowing the built-in eval
        result.append(ev.evaluate(predictions))
    return result
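# Hypothetical usage of evaluate() above (illustrative, not from the source),
# assuming `predictions` is a scored DataFrame carrying "label", "prediction",
# and "rawPrediction" columns:
#
#     auc_roc, auc_pr = evaluate(predictions, ["areaUnderROC", "areaUnderPR"])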
def pipelineRF(dataDF):
    """
    :param dataDF: input DataFrame with 'label' and 'features' columns
    :return:
    """
    print('pipeline starting...')
    labelIndexer_transModel = StringIndexer(inputCol='label', outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features",
                                           maxCategories=37).fit(dataDF)
    # dtEstimator = DecisionTreeClassifier(featuresCol='indexed_features', labelCol='indexLabel',
    #                                      maxDepth=5, maxBins=40, minInstancesPerNode=1,
    #                                      minInfoGain=0.0, impurity='entropy')
    rfEstimator = RandomForestClassifier(labelCol='indexLabel', featuresCol='indexed_features',
                                         maxBins=40, seed=13)
    pipeline = Pipeline(stages=[labelIndexer_transModel, featIndexer_transModel, rfEstimator])
    paramGrid = ParamGridBuilder() \
        .addGrid(rfEstimator.maxDepth, [5, 10, 30]) \
        .addGrid(rfEstimator.numTrees, [20, 50, 100]).build()
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              rawPredictionCol='rawPrediction',
                                              metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds=10)
    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel was fit using parameters:\n")
    pprint(cvModel.explainParams())

    predictionDF = cvModel.transform(dataDF)
    selected = predictionDF \
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.take(5):
        print(row)

    aucMetric = evaluator.evaluate(selected)
    print("auc of test data is:%.3f" % aucMetric)
def buil_lrmodel(path):
    df = load_data(path)

    # -------------------- preparing the dataset -------------------------------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    print("count = ", df.count())
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # ------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)
    prediction = model.transform(df)
    prediction.show(truncate=False)
    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation :", evaluator.evaluate(prediction))

    # -------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000]) \
        .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))
    return cvModel, avg_age
def main():
    '''
    Takes one input argument: the location of the directory with the training
    and test data files.
    :return: Prints the area under the ROC curve to the console.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures) \
        .addGrid(lr.regParam, regParam).build()
    cv = CrossValidator().setEstimator(pipeline) \
        .setEvaluator(BinaryClassificationEvaluator()) \
        .setEstimatorParamMaps(paramGrid).setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()

    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
def main():
    # Read training and test data as DataFrames
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(),
                                   outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) \
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
d6.groupBy("label").count().show(truncate=False) dataArr = d6.randomSplit([0.7, 0.3]) train = dataArr[0] test = dataArr[1] indexer = StringIndexer(inputCol="road", outputCol="roadcode") assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"], outputCol="features") dt = DecisionTreeClassifier(labelCol="label", featuresCol="features") pipeline = Pipeline(stages=[indexer, assembler, dt]) model = pipeline.fit(train) predict = model.transform(test) predict.select("label", "probability", "prediction").show(3, False) # areaUnderROC, areaUnderPR evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC") print(evaluator.evaluate(predict)) treeModel = model.stages[2] print("Learned classification tree model:%s" % treeModel.toDebugString) spark.stop
# Spark Model Hyperparameter Tuning
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Random forest parameters supplied by the user
user_rf_param_numTreeSet = [4, 8, 16, 32, 64]
user_rf_param_maxDepthSet = [10, 20, 30]
user_rf_param_impuritySet = ['gini', 'entropy']
user_rf_param_numFolds = 3

# Settings for Random Forest - parameter grid search
rf_paramGrid = ParamGridBuilder() \
    .addGrid(rfclassifier.numTrees, user_rf_param_numTreeSet) \
    .addGrid(rfclassifier.maxDepth, user_rf_param_maxDepthSet) \
    .addGrid(rfclassifier.impurity, user_rf_param_impuritySet) \
    .build()

evaluator = BinaryClassificationEvaluator()
multiEvaluator = MulticlassClassificationEvaluator()

# Setting parameters for cross-validation
rf_cv = CrossValidator(estimator=pipeline,
                       evaluator=evaluator,
                       estimatorParamMaps=rf_paramGrid,
                       numFolds=user_rf_param_numFolds)
rf_cvmodel = rf_cv.fit(train)

# Evaluating Random Forest model performance
from pyspark.sql.functions import udf

rf_predictions = rf_cvmodel.transform(test)
auroc = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderPR"})
print("The AUROC is %s and the AUPR is %s" % (auroc, aupr))
f1score = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "f1"})
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)
lgpredictions_train = logr_model.transform(pcatrain_df)

# In[86]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="attrition_class",
                                              predictionCol="prediction",
                                              metricName="accuracy")
lgaccuracy = evaluator.evaluate(lgpredictions)
lgaccuracy_train = evaluator.evaluate(lgpredictions_train)
print("Test Accuracy = %g" % (lgaccuracy))
print("Train Accuracy = %g" % (lgaccuracy_train))

predictions_and_labels = logr_model.evaluate(pcatest_df)
evaluatorroc = BinaryClassificationEvaluator(labelCol="attrition_class")
my_final_roc = evaluatorroc.evaluate(predictions_and_labels.predictions)
print("AUC Score =", my_final_roc)

# In[85]:

# ROC curve
import matplotlib.pyplot as plt

plt.figure(figsize=(5, 5))
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(logr_model.summary.roc.select('FPR').collect(),
         logr_model.summary.roc.select('TPR').collect())
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()
# COMMAND ----------

# View the model's predictions and the probability of each prediction class.
# You can select any columns in the above schema to view as well.
selected = predictions.select("label", "prediction", "probability")
display(selected)

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# COMMAND ----------

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

tp = selected.where(selected["label"] == 1).where(selected["prediction"] == 1).count()
tn = selected.where(selected["label"] == 0).where(selected["prediction"] == 0).count()
fp = selected.where(selected["label"] == 0).where(selected["prediction"] == 1).count()
fn = selected.where(selected["label"] == 1).where(selected["prediction"] == 0).count()

# COMMAND ----------

print(tp)
print(tn)
print(fp)
print(fn)
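# COMMAND ----------

# Illustrative follow-up (an assumption, not part of the original notebook):
# derive precision, recall, and accuracy from the confusion-matrix counts above.
precision = tp / float(tp + fp)
recall = tp / float(tp + fn)
accuracy = (tp + tn) / float(tp + tn + fp + fn)
print("precision = %f, recall = %f, accuracy = %f" % (precision, recall, accuracy))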
from pyspark.ml.feature import VectorAssembler

# Assemble the feature columns into a single vector column
assembler = VectorAssembler(
    inputCols=['number_customer_service_calls',
               'total_night_minutes',
               'total_day_minutes',
               'total_eve_minutes',
               'account_length'],
    outputCol='features')

# Transform labels
from pyspark.ml.feature import StringIndexer

label_indexer = StringIndexer(inputCol='churned', outputCol='label')

# Fit the model
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(labelCol='label', featuresCol='features')
pipeline = Pipeline(stages=[assembler, label_indexer, classifier])
(train, test) = df.randomSplit([0.7, 0.3])
model = pipeline.fit(train)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(train)
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)
# Step - 3: Set up the MultilayerPerceptronClassifier
trainer = MultilayerPerceptronClassifier(labelCol="eatable", featuresCol="features",
                                         maxIter=200, seed=1234,
                                         layers=layers, blockSize=10, stepSize=0.001)

# Step - 4: Train the model
model = trainer.fit(output)
print(model.weights)  # ~ 7600 weights

rawPredictions = model.transform(output)
predictions = enrichPredictions(rawPredictions)
predictions.show(100)

# Step - 5: Evaluate prediction
# The 0/1 prediction column is used as the score here, since the MLP output
# carries no separate raw-prediction column.
evaluator = BinaryClassificationEvaluator(labelCol="eatable", rawPredictionCol="prediction")

# Step - 6: Calculate ROC AUC
rocAuc = evaluator.evaluate(rawPredictions)
print("ROC_AUC = %g " % rocAuc)

spark.stop()
splits = df_gender_analysis.randomSplit([0.75, 0.25])
data_train = splits[0]
data_test = splits[1]
print("The training data has {} instances.".format(data_train.count()))
print("The test data has {} instances.".format(data_test.count()))

lr = LogisticRegression(maxIter=10, regParam=0.3)

# Fit the model
lrModel = lr.fit(data_train)
trainingSummary = lrModel.summary
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

predictions = lrModel.transform(data_test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)
evaluator.getMetricName()

# specify layers for the neural network:
# input layer of size 90000 (features), two intermediate layers of size 10,
# and output of size 2 (classes)
layers = [90000, 10, 10, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
# %% [markdown]
# ## Prediction on training data

# %%
pred_training_dtc = dtc_model.transform(training)
show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']
pred_training_dtc.select(show_columns).show(5, truncate=True)

# %% [markdown]
# ## Evaluator

# %%
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('areaUnderROC on training data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_training_dtc))

# %% [markdown]
# ## Prediction on test data

# %%
pred_testing_dtc = dtc_model.transform(testing)
pred_testing_dtc.select(show_columns).show(5, truncate=True)
print('areaUnderROC on testing data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_testing_dtc))

# %% [markdown]
# ## Confusion Matrix
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

sample_test_data_path = 'test_input/logistic_regression/sample_libsvm_data.txt'

spark = SparkSession.builder.appName('mylogreg').getOrCreate()
data = spark.read.format('libsvm').load(sample_test_data_path)
train_data, test_data = data.randomSplit([0.7, 0.3])

mylogreg_model = LogisticRegression()
fitted_log_reg_model = mylogreg_model.fit(train_data)

# log_summary = fitted_log_reg_model.summary
# log_summary.predictions.show()

prediction_and_labels = fitted_log_reg_model.evaluate(test_data)
prediction_and_labels.predictions.show()

my_eval = BinaryClassificationEvaluator()
my_final_roc = my_eval.evaluate(prediction_and_labels.predictions)
print(my_final_roc)
interactor.fit(df_train).transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000)
stages = [interactor, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
model = pipeline.fit(df_train)
predictions = model.transform(df_test)
predictions.cache()
predictions.show()

from pyspark.ml.evaluation import BinaryClassificationEvaluator

ev = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName="areaUnderROC")
print(ev.evaluate(predictions))

spark.stop()
# step 9
result9_df = result8_transformed
splits = result9_df.randomSplit([0.8, 0.2], seed=1)
train = splits[0].cache()
valid = splits[1].cache()
train.show(n)
valid.show(n)

# step 10
lr = LogisticRegression(regParam=0.01, maxIter=100, fitIntercept=True)
bceval = BinaryClassificationEvaluator()
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(n_fold)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, max_iter) \
    .addGrid(lr.regParam, reg_params).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(train)
print(cvmodel.bestModel.coefficients)
print('')
print(cvmodel.bestModel.intercept)
print('')
print(cvmodel.bestModel.getMaxIter())
print('')
# LogisticRegression.transform() will only use the 'features' column.
predictions = lrModel.transform(testData)
predictions.show()

# You can see how many predictions were wrong
predictions.groupBy('label', 'prediction').count().show()

# ---------------------------------------------------------------- MODEL EVALUATION ----------------------------------------------------------
# We can use BinaryClassificationEvaluator to evaluate our model.
# We can set the required column names in the rawPredictionCol and labelCol Params
# and the metric in the metricName Param.

# Evaluate model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='label',
                                          rawPredictionCol="rawPrediction",
                                          metricName='areaUnderROC')
print('Test Area Under ROC', evaluator.evaluate(predictions))
# Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC
print(lr.explainParams())

# Model summary
trainingSummary = lrModel.summary
print(trainingSummary.accuracy)
print(trainingSummary.areaUnderROC)

# Plots of the receiver operating characteristic curve and areaUnderROC.
roc = trainingSummary.roc.toPandas()
plt.figure()
plt.plot(roc['FPR'], roc['TPR'],
         label='ROC curve (area = %0.2f)' % trainingSummary.areaUnderROC)
cm.prediction).count() / cm.count()
# Out[51]: 0.8216095682140685

# accuracy
def accuracy_m(model):
    predictions = model.transform(test_data)
    cm = predictions.select('label', 'prediction')
    acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
    print('model accuracy : %.3f%%' % (acc * 100))

accuracy_m(model=linearModel)  # model accuracy : 82.161%

# Use ROC for binary classification: it plots the true-positive rate (recall)
# against the false-positive rate.
# TODO: verify
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
print(evaluator.evaluate(predictions))  # 0.8952698333157076
print(evaluator.getMetricName())  # areaUnderROC

# step 6) tune the hyperparameter
'''
To reduce the computation time, only the regularization parameter is tuned,
with only two values.
'''
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = (ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.5]).build())

# time check and kfold=5
from time import *

start_time = time()
    'abs_title_sentiment_polarity'], outputCol='features')

new_data = assembler.transform(data)
final_data = new_data.select('features', 'shares')

from pyspark.ml.feature import QuantileDiscretizer

discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result', 'features')

from pyspark.ml.classification import RandomForestClassifier

rfc = RandomForestClassifier(numTrees=250, labelCol='result', featuresCol='features')
train_data, test_data = finalData.randomSplit([0.7, 0.3])
rfc_model = rfc.fit(train_data)
result = rfc_model.transform(test_data)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))

test_data.head(1)

# import os, sys
# import pandas
# import plotly.plotly as py
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import cufflinks as cf
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# sys.path.append("".join([os.environ["HOME"]]))
# result.columns
# predictions_pdf = result.select('result', 'features', 'rawPrediction', 'probability', 'prediction').toPandas()
# cumulative_stats = predictions_pdf.groupby(['prediction']).count()
    fpr=0.05)

train = css.fit(train).transform(train)
test = css.fit(test).transform(test)

lr = LogisticRegression(labelCol="Outcome", featuresCol="Aspect",
                        weightCol="classWeights", maxIter=10)
model = lr.fit(train)
predict_train = model.transform(train)
predict_test = model.transform(test)
# predict_test.select("Outcome", "prediction").show(10)

# This is the evaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="Outcome")
predict_test.select("Outcome", "rawPrediction", "prediction", "probability").show(5)
print("The area under ROC for train set is {}".format(evaluator.evaluate(predict_train)))
print("Test area under ROC {}".format(evaluator.evaluate(predict_test)))

# Model #2: DecisionTreeClassifier
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="Outcome", featuresCol="features")
dt_model = dt.fit(train)
dt_prediction = dt_model.transform(test)
dt_auc = evaluator.evaluate(dt_prediction)
print("AUC of DecisionTreeClassifier is = %g" % (dt_auc))
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.util import MLUtils

rf = RandomForestClassifier(numTrees=100, maxDepth=10, maxBins=128)
pipeline = Pipeline(stages=[rf])
pipelineModel = pipeline.fit(training)
# trainingPredictions = pipelineModel.transform(training)
# trainingPredictions.show()
# trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)

# evaluator = MulticlassClassificationEvaluator(
#     labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()

from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
# aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)

# This multiplies out to (2 x 3) x 3 = 18 different models being trained.
# k = 3 and k = 10 are common fold counts.
from pyspark.ml.tuning import *

paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini']) \
    .addGrid(rf.numTrees, [10, 30, 50]).build()
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator) \
    .setEstimatorParamMaps(paramGrid).setNumFolds(3)

# Run cross-validation, and choose the best set of parameters.
cvModel = cv.fit(training)
# COMMAND ----------

pipeline = Pipeline(stages=imputer + encoder + assembler)
tmp = pipeline.fit(airline_delays_train).transform(airline_delays_train)
tmp.printSchema()

# COMMAND ----------

# define the estimator
randForest = RandomForestClassifier(featuresCol='features', labelCol=target)

# define the modeling pipeline with formula + feature transformations + estimator
pipeline = Pipeline(stages=imputer + encoder + assembler + [randForest])

# define a binary classification evaluator with the right metric
evaluator = BinaryClassificationEvaluator(labelCol=target, metricName="areaUnderROC")

# Define the parameter grid for random forest
param_grid = ParamGridBuilder() \
    .addGrid(randForest.numTrees, [10]) \
    .addGrid(randForest.maxDepth, [3]) \
    .build()

cv_model = build_and_tune_model_with_cv(pipeline, param_grid, evaluator, airline_delays_train)

# COMMAND ----------

model_summary_rf(cv_model)

# COMMAND ----------
def _val(target, model):
    clf, paramGrid = model
    evaluator = BinaryClassificationEvaluator(labelCol=target,
                                              rawPredictionCol='prediction')
    # validator = TrainValidationSplit(estimator=clf, estimatorParamMaps=paramGrid, evaluator=evaluator)
    validator = CrossValidator(estimator=clf,
                               estimatorParamMaps=paramGrid,
                               evaluator=evaluator,
                               numFolds=3)
    return validator
def main(argv):
    start = time.time()

    # INGEST DATA INTO DATAFRAME OR TEMP TABLE
    print("Ingest data...")
    sc = SparkContext(appName="KaggleDato")
    sqlContext = SQLContext(sc)
    train_label_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_TRAIN_LABELS)
    input_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_JSON)
    # input_df.printSchema()
    # train_label_df.printSchema()
    # input_df.show()
    # print(input_df.count())

    # Make DF with labels
    train_wlabels_df = input_df.join(train_label_df, "id")
    train_wlabels_df.repartition("label")
    train_wlabels_df.explain()
    # train_wlabels_df.printSchema()

    # train/CV split, stratified sampling
    # 1 is the under-represented class
    fractions = {1.0: 1.0, 0.0: 0.15}
    stratified = train_wlabels_df.sampleBy("label", fractions, 36)
    # note: the stratified sample is computed but the split below uses the full DF
    train, cv = train_wlabels_df.randomSplit([0.7, 0.3])

    print("Prepare text features...")
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    # tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    # tokenized_df = tokenizer.transform(train_wlabels_df)
    # tokenized_df.show()

    # remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    # filtered_df = remover.transform(tokenized_df)
    # filtered_df.printSchema()
    # filtered_df.show()

    # try ngrams instead
    # ngram = NGram(n=2, inputCol="filtered", outputCol="filtered")
    # ngram_df = ngram.transform(tokenized_df_copy)

    # Hashing
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    # featurized_df = hashingTF.transform(filtered_df)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    # idfModel = idf.fit(featurized_df)
    # rescaled_df = idfModel.transform(featurized_df)
    # rescaled_df.printSchema()

    # Trying various classifiers here
    # create a pipeline
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

    # Train a RandomForest model.
    # rf = RandomForestClassifier(numTrees=10, impurity="gini", maxDepth=4, maxBins=32)
    # pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, rf])

    # Parameter search grid
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 20, 30]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    # Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
    # is areaUnderROC. metricName options are: areaUnderROC | areaUnderPR
    ev = BinaryClassificationEvaluator(metricName="areaUnderROC")
    # Alternative: use a multiclass classification evaluator
    # metricName options are f1, precision, recall
    # ev = MulticlassClassificationEvaluator(metricName="f1")

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=ev,
                              numFolds=2)  # use 3+ folds in practice

    # Below is the single-pipeline vs. parameter-search switch.
    # Fit the pipeline to training documents.
    model = pipeline.fit(train)
    # model = crossval.fit(train)

    print("Evaluate model on test instances and compute test error...")
    prediction = model.transform(cv)
    prediction.select("id", "text", "probability", "prediction").show(5)
    accuracy = ev.evaluate(prediction)  # ev returns areaUnderROC, so this is 1 - AUC
    print("CV Error = " + str(1.0 - accuracy))
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(numTrees=25, labelCol='PrivateIndex', featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

dtcModel = dtc.fit(train_data)
rfcModel = rfc.fit(train_data)
gbtModel = gbt.fit(train_data)

dtcPred = dtcModel.transform(test_data)
rfcPred = rfcModel.transform(test_data)
gbtPred = gbtModel.transform(test_data)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

binaryEval = BinaryClassificationEvaluator(labelCol='PrivateIndex')
multiEval = MulticlassClassificationEvaluator(metricName='accuracy')

# BinaryClassificationEvaluator returns areaUnderROC by default
print('DTC AUC:')
print(binaryEval.evaluate(dtcPred))
print('RFC AUC:')
print(binaryEval.evaluate(rfcPred))
print('GBT AUC:')
print(binaryEval.evaluate(gbtPred))

cols = [
    'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
    'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD',
    'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'
# =========== what does it mean from here ===========
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and rf (random forest).
# rf = RandomForestClassifier().setMaxBins(70)
rf = RandomForestClassifier(numTrees=100, maxDepth=20, labelCol="label")  # maxDepth=20, maxBins=64,
pipeline = Pipeline(stages=[rf])
pipelineModel = pipeline.fit(training)
trainingPredictions = pipelineModel.transform(training)
# trainingPredictions.show()
trainingPredictions.select("prediction", "label", "features").show()
testPredictions = pipelineModel.transform(test)

# evaluator = MulticlassClassificationEvaluator(
#     labelCol="label", predictionCol="prediction", metricName="precision")
evaluator = BinaryClassificationEvaluator()

from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

evaluatorParaMap = {evaluator.metricName: "areaUnderROC"}
aucTraining = evaluator.evaluate(trainingPredictions, evaluatorParaMap)
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)
print("pipeline Test AUC: %g" % aucTest)

from pyspark.ml.tuning import *

# This multiplies out to (2 x 3 x 3) x 10 = 180 different models being trained.
# k = 3 and k = 10 are common fold counts.
# paramGrid = ParamGridBuilder().addGrid(rf.impurity, ['entropy', 'gini']).addGrid(rf.numTrees, [30, 50, 100]).build()
# [10, 50, 100]: 50 was the best value
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [10, 20, 30]).addGrid(
["contact_type", "driver_gender", "driver_race", "drugs_related_stop", "highway_type", "officer_gender", "officer_race", "search_conducted", "stop_outcome"]) df_hot = oneHotEncodeColumns(df_string, ['contact_type', 'driver_race', 'highway_type', 'officer_race', 'driver_gender', 'officer_gender']) input_cols = ['stop_hour', 'id', 'drugs_related_stop', 'search_conducted', 'stop_date_year', 'stop_date_month', 'stop_date_dayofmonth', 'stop_date_weekofyear', 'county_fips', 'driver_age', 'officer_id', 'road_number', 'milepost', 'lat', 'lon', 'contact_type', 'driver_race', 'highway_type', 'driver_gender', 'officer_gender', 'officer_race', 'gender_diff', 'race_diff', 'time_of_day'] va = VectorAssembler(outputCol="features", inputCols=input_cols) df_assembled = va.transform(df_hot).select("features", "stop_outcome").withColumnRenamed("stop_outcome", "label") splits = df_assembled.randomSplit([0.8, 0.2]) df_train = splits[0].cache() df_test = splits[1].cache() lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True) lrmodel = lr.fit(df_train) validPredicts = lrmodel.transform(df_test) mceval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy") accuracy = mceval.evaluate(validPredicts) bceval = BinaryClassificationEvaluator(metricName="areaUnderPR") print("Area Under PR Curve: %g" % bceval.evaluate(validPredicts)) print("Test Error: %g" % (1.0-accuracy)) lrmodel.save("log_rand_local")
print(data_df.count())
print(trainingData.count())
print(testData.count())

# Train the classifier (a RandomForest variant is left commented out).
# dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=5)
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# parameter grid
from pyspark.ml.tuning import ParamGridBuilder

# Feature 35 has 65 distinct values; the default is 32, so maxBins (the number
# of bins) is raised to at least 65.
param_grid = ParamGridBuilder().addGrid(dt.maxBins, [65, 68, 71]) \
    .addGrid(dt.maxDepth, [4, 6, 8]).build()

# binary evaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")

# build the cross-validation model, 4 folds
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=dt, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=4)

# build the Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, cv])

# Train the pipeline; on the training data the estimator produces the model.
model = pipeline.fit(trainingData)

# model-generation timestamp
timeendModel = datetime.datetime.now()

# Build the DataFrame of predictions from the test data and the model above.
# string-index the label column into a column named "label"
si3 = StringIndexer(inputCol=' income', outputCol='label')

# assemble the encoded feature columns into a column named "features"
assembler = VectorAssembler(inputCols=['ed-encoded', 'ms-encoded', ' hours-per-week'],
                            outputCol="features")

# put together the pipeline
pipe = Pipeline(stages=[si1, ohe1, si2, ohe2, si3, assembler, lr])

# train the model
model = pipe.fit(train)

# make predictions
pred = model.transform(test)

# evaluate. note only 2 metrics are supported out of the box by Spark ML.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)
au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)
print("Area under ROC: {}".format(au_roc))
print("Area Under PR: {}".format(au_prc))

# Log the metrics
run_logger.log("AU ROC", au_roc)
run_logger.log("AU PRC", au_prc)

print("******** SAVE THE MODEL ***********")
model.write().overwrite().save("./outputs/AdultCensus.mml")
# drop all missing data
my_final_data = my_cols.na.drop()

gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol="SexIndex", outputCol='SexVec')
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkVec')

assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'EmbarkVec', 'Age',
                                       'SibSp', 'Parch', 'Fare'],
                            outputCol='features')

log_reg_titantic = LogisticRegression(featuresCol='features', labelCol='Survived')
pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder,
                            embark_encoder, assembler, log_reg_titantic])

train_data, test_data = my_final_data.randomSplit([0.7, 0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')
# results.select('Survived', 'prediction').show()
AUC = my_eval.evaluate(results)
print("this is AUC: {}".format(AUC))
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 6 (features), one intermediate layer of size 10,
# and output of size 2 (classes)
layers = [6, 10, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select('prediction', 'label')
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
print('Test set accuracy = ' + str(evaluator.evaluate(predictionAndLabels)))

# Compute AUC
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
evaluation = evaluator.evaluate(model.transform(test))
print('AUC:', evaluation)

# Stop
sc.stop()
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
# skip the header row (tuple-parameter lambdas were removed in Python 3)
rows = lines.zipWithIndex().filter(lambda row_index: row_index[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]),
                                 sentence=review_to_wordlist(p[2])))
reviewDF = sqlContext.createDataFrame(review)
transformDF = model.transform(reviewDF)
selectData = transformDF.select("label", "features")

(trainingData, testData) = selectData.randomSplit([0.6, 0.4])
lr = LogisticRegression(maxIter=5, regParam=0.01)
model = lr.fit(trainingData)
result = model.transform(testData)

u_lines.unpersist()
u_rows.unpersist()
u_parts.unpersist()
u_review.unpersist()
lines.unpersist()
rows.unpersist()

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(result, {evaluator.metricName: "areaUnderROC"})
evaluator.evaluate(result, {evaluator.metricName: "areaUnderPR"})
train = train.withColumnRenamed("clean", "label")
training_spark_df_binary, testing_spark_df_binary = train.randomSplit([0.8, 0.2], seed=2018)

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [1000]) \
    .addGrid(lr.regParam, [0.1]) \
    .addGrid(lr.elasticNetParam, [0.3]) \
    .build()

crossval = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    # set area under the precision-recall curve as the evaluation metric
    evaluator=BinaryClassificationEvaluator().setMetricName('areaUnderPR'),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8)

cvModel = crossval.fit(training_spark_df_binary)
cvModel.bestModel.write().overwrite().save("LogisticRegressionModel")

# read the persisted model via the pipeline API
from pyspark.ml.pipeline import PipelineModel

persistedModel = PipelineModel.load("LogisticRegressionModel")
train_prediction = persistedModel.transform(training_spark_df_binary)
test_prediction = persistedModel.transform(testing_spark_df_binary)
otherDatasetTest = persistedModel.transform(ldt)
bst_model_path = model_save_path + "_bst_model"
train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)
bst_model = train_with_tune(train_df)
bst_model.write().overwrite().save(bst_model_path)

# Use the best model found during training to predict on the test data.
# Each prediction row has a structure like the following:
#   features = Vectors.dense(...)
#   label=0,
#   rawPrediction=DenseVector([0.048, -0.048]),
#   probability=DenseVector([0.512, 0.488]),
#   prediction=0.0
loaded_bst_model = PipelineModel.load(bst_model_path)
result = loaded_bst_model.transform(train_df)
predict_result = loaded_bst_model.transform(test_df)
print("predicted sample :", predict_result.take(3))

# Evaluate the trained binary classification model
bin_eval = BinaryClassificationEvaluator()
predict_metric = bin_eval.evaluate(predict_result, {bin_eval.metricName: "areaUnderROC"})
print("trained model test auc metric", predict_metric)

# Inspect the detailed classification metrics; f1 is computed by default
mm = MulticlassClassificationEvaluator()
f1 = mm.evaluate(predict_result)
accuracy = mm.evaluate(predict_result, {mm.metricName: "accuracy"})
precision = mm.evaluate(predict_result, {mm.metricName: "weightedPrecision"})
recall = mm.evaluate(predict_result, {mm.metricName: "weightedRecall"})
print("predict trained model precision: %f, recall: %f, acc: %s, f1: %f "
      % (precision, recall, accuracy, f1))
def evaluate_roc_auc(predictions, sqlc):
    raw = scores_and_labels(predictions, sqlc)
    evaluator = BinaryClassificationEvaluator()
    return evaluator.evaluate(raw)
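# A minimal sketch (an assumption, not from the source) of the scores_and_labels
# helper that evaluate_roc_auc() relies on: build a two-column DataFrame with the
# default column names BinaryClassificationEvaluator expects, using the
# positive-class probability as the score.
def scores_and_labels(predictions, sqlc):
    rows = predictions.rdd.map(lambda p: (float(p.probability[1]), float(p.label)))
    return sqlc.createDataFrame(rows, ["rawPrediction", "label"])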
# build labeled points from the data
# if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'), (2,'b'), (3,'c')]
data_class = list(zip(data, Y))
dcRDD = sc.parallelize(data_class, numSlices=16)
# get the labeled points
labeledRDD = dcRDD.map(partial(createBinaryLabeledPoint, dictionary=dict_broad.value))

# ****************************************************************
# ********************* CROSS VALIDATION: 80%/20% ****************
# ******************* Model: logistic regression ****************
# ****************************************************************
# create a DataFrame from an RDD -> features must be Vectors.sparse from pyspark.mllib.linalg
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(labeledRDD, ['features', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
dfTrain.show()

# choose estimator and grid
lr = LogisticRegression()  # choose the model
# The grid here searches over maxIter. For elastic-net regularization,
# alpha=0 gives an L2 penalty and alpha=1 gives an L1 penalty.
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()

print("Start Cross validation")
evaluator = BinaryClassificationEvaluator()  # choose the evaluator
# perform the cross validation and keep the best value of maxIter
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(dfTrain)  # train the model on the whole training set
# compute areaUnderROC on the test set
resultat = evaluator.evaluate(cvModel.transform(dfTest))
print("Area under ROC on the test set (0-1): ", resultat)
adultvalid = splits[1].cache()

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(adulttrain)
lrmodel.weights
lrmodel.intercept

# section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()
bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

# section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]) \
    .addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))
rfc = RandomForestClassifier(labelCol='PrivateIndex',
                             featuresCol='features', numTrees=150)

dtc_model = dtc.fit(train_data)
gbt_model = gbt.fit(train_data)
rfc_model = rfc.fit(train_data)

# Get the predictions
dtc_preds = dtc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)

# Show the predictions
dtc_preds.show()
gbt_preds.show()
rfc_preds.show()

# Evaluate the models
binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')
# GBT only outputs predictions, not the raw predictions, so we need to
# specify this in the BinaryClassificationEvaluator
binary_eval_gbt = BinaryClassificationEvaluator(labelCol='PrivateIndex',
                                                rawPredictionCol='prediction')

print('DTC: ')
print(binary_eval.evaluate(dtc_preds))
print('RFC: ')
print(binary_eval.evaluate(rfc_preds))
print('GBT: ')
print(binary_eval_gbt.evaluate(gbt_preds))
def main(argv):
    start = time.time()

    # INGEST DATA INTO DATAFRAME OR TEMP TABLE
    print("Ingest data...")
    sc = SparkContext(appName="KaggleDato")
    sqlContext = SQLContext(sc)
    train_label_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_TRAIN_LABELS)
    input_df = sqlContext.read.format('com.databricks.spark.avro').load(PATH_TO_JSON)
    # input_df.printSchema()
    # train_label_df.printSchema()
    # input_df.show()

    # Make DF with labels
    train_wlabels_df = input_df.join(train_label_df, "id")

    # train/CV split, stratified sampling
    # 1 is the under-represented class
    fractions = {1.0: 1.0, 0.0: 1.0}
    stratified = train_wlabels_df.sampleBy("label", fractions, 36)
    stratified = stratified.repartition(200)
    train, cv = stratified.randomSplit([0.7, 0.3])

    print("Prepare text features...")
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    # tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

    # remove stopwords
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    # filtered_df = remover.transform(tokenized_df)
    # filtered_df.printSchema()
    # filtered_df.show()

    # try ngrams instead
    # ngram = NGram(n=2, inputCol="filtered", outputCol="filtered")
    # ngram_df = ngram.transform(tokenized_df_copy)

    # Hashing
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    # Trying various classifiers here
    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 2 distinct values are treated as continuous.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                                numTrees=10, impurity="gini", maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf,
                                labelIndexer, featureIndexer, rf])

    # Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
    # is areaUnderROC. metricName options are: areaUnderROC | areaUnderPR
    metricName = "areaUnderPR"
    ev = BinaryClassificationEvaluator(metricName=metricName)
    # Alternative: use a multiclass classification evaluator
    # metricName options are f1, precision, recall
    # ev = MulticlassClassificationEvaluator(metricName="f1")

    # Fit the pipeline to training documents.
    model = pipeline.fit(train)

    print("Evaluate model on test instances and compute test error...")
    prediction = model.transform(cv)
    # prediction = labelConverter.transform(prediction)
    prediction.select("label", "text", "probability", "prediction").show(100)
    result = ev.evaluate(prediction)
    print(metricName, ": ", result)

    # the filter keeps correctly classified rows, so this ratio is accuracy;
    # report the error as its complement
    cvAcc = prediction.filter(prediction.label == prediction.prediction).count() / float(cv.count())
    print('CV Error = ' + str(1.0 - cvAcc))
sqlCt = SQLContext(sc)
trainDF = sqlCt.read.parquet("20news_train.parquet")

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training data.
model = pipeline.fit(trainDF)

# Evaluate the model on testing data
testDF = sqlCt.read.parquet("20news_test.parquet")
prediction = model.transform(testDF)
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(prediction))

'''sbaronia - setting up parameters using ParamGridBuilder
with 3 different features and 9 diff regParam'''
param_Grid = (ParamGridBuilder()
              .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])
              .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
              .build())

'''sbaronia - creating a new CrossValidator that will use the above
parameters and the same evaluator with 2-fold cross validation'''
cross_val = (CrossValidator()
             .setEstimator(pipeline)
print("Intercepto: ", str(model.interceptVector)) # predicciones con el conjunto de prueba predictions = predictLogistico(test, model) modelSummary = model.summary roc = modelSummary.roc.toPandas() plt.plot(roc['FPR'], roc['TPR']) plt.ylabel('False Positive Rate') plt.xlabel('True Positive Rate') plt.title('ROC Curve') plt.show() print('Training set areaUnderROC: ' + str(modelSummary.areaUnderROC)) evaluator = BinaryClassificationEvaluator() print('Test Area Under ROC', evaluator.evaluate(predictions)) pr = modelSummary.pr.toPandas() plt.plot(pr['recall'], pr['precision']) plt.ylabel('Precision') plt.xlabel('Recall') plt.show() # Matriz de confusión en las predicciones del test set predictions.crosstab("label", "prediction").show() # otras métricas de evaluación print("CLASES:", modelSummary.labels) print("MEDIDA-F", modelSummary.fMeasureByLabel(beta=1.0)) print("TASA DE FALSOS-POSITIVOS:", modelSummary.falsePositiveRateByLabel)
# model22.numFeatures
training2 = model22.transform(training)
PredictionsandLabels = training2.select('prediction', 'type1').rdd
PredictionsandLabels.collect()

# --------------------------------------------------------------
# Resubstitution approach
from pyspark.mllib.evaluation import MulticlassMetrics

metrics1 = MulticlassMetrics(PredictionsandLabels)
metrics1.accuracy

# --------------------------------------------------------------------------
# 1 step: calculate cv score for 1 model
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator2 = BinaryClassificationEvaluator(labelCol='type1', rawPredictionCol='prediction')
paramGrid = ParamGridBuilder().addGrid(df1.maxDepth, [2, 3, 4]).build()  # ,5,6,7,8,10,15,20]).build()
crossval2 = CrossValidator(estimator=df1,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator2,
                           numFolds=10)
model27 = crossval2.fit(training)
model27.bestModel
model27.avgMetrics
training2 = model27.transform(training)

# CV / Parameter Tuning approach ---------------------------------------------
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [300, 400]) \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]).build()

# Set up cross-validation.
cv = CrossValidator().setNumFolds(3).setEstimator(pipeline) \
    .setEstimatorParamMaps(paramGrid).setEvaluator(BinaryClassificationEvaluator())

# Fit a model with cross-validation.
cvModel = cv.fit(trainingData)

testTransform = cvModel.transform(testData)
predictions = testTransform.select("review", "prediction", "label")
predictionsAndLabels = predictions.rdd.map(lambda x: (x[1], x[2]))
testErr = predictionsAndLabels.filter(lambda r: r[0] != r[1]).count() / float(testData.count())
print("TestErr: " + str(testErr))

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(testTransform, {evaluator.metricName: "areaUnderPR"})
evaluator.evaluate(testTransform, {evaluator.metricName: "areaUnderROC"})
model_rf = MLPipelineModel.load(model_path)

# generate predictions
startTime = int(time.time())
out = model_rf.transform(SparkDataSources({'nodeADP': dataframe}))
predictions = out[0].data_frame

threshold = {'min_value': 0.3, 'metric': 'areaUnderROC', 'mid_value': 0.7}

# replace "label" below with the numeric representation of
# the label column that you defined while training the model
labelCol = "label"

# create evaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol=labelCol)

# compute evaluations
eval_fields = {
    "accuracyScore": predictions.rdd.filter(lambda x: x[labelCol] == x["prediction"]).count()
                     * 1.0 / predictions.count(),
    "areaUnderPR": evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}),
    "areaUnderROC": evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"}),
    "thresholdMetric": threshold["metric"],
    "thresholdMinValue": threshold["min_value"],
    "thresholdMidValue":
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labels = StringIndexer(inputCol="original", outputCol="label")
lines = Pipeline(stages=[tokenizer, hashtf, idf, labels])

# For creating the training, validation, and test models
linesFit = lines.fit(trainSet)
trainModel = linesFit.transform(trainSet)
validationModel = linesFit.transform(valSet)

# Train and check the model
lr = LogisticRegression(maxIter=100)
model = lr.fit(trainModel)
predictions = model.transform(validationModel)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
predictions.show(30)

# show the meaning of the indexed labels
converter = IndexToString(inputCol="label", outputCol="label meaning")
converted = converter.transform(predictions.select("label").distinct())
converted.select("label", "label meaning").distinct().show()

# calculate the precision and recall
truePositive = predictions[(predictions.label == 0) & (predictions.prediction == 0)].count()
trueNegative = predictions[(predictions.label == 1) & (predictions.prediction == 1)].count()
falsePositive = predictions[(predictions.label == 1) & (predictions.prediction == 0)].count()
falseNegative = predictions[(predictions.label == 0)
# View the model's predictions and probabilities of each prediction class
# You can select any columns in the above schema to view as well. For example's sake we will choose age & occupation
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

# COMMAND ----------

# MAGIC %md
# MAGIC We can make use of the BinaryClassificationEvaluator method to evaluate our model. The Evaluator expects two input columns: (rawPrediction, label).

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC

# COMMAND ----------

evaluator.getMetricName()

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR"), as shown below.
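# COMMAND ----------

# Illustrative cell (an assumption, not part of the original notebook): switch
# the metric to areaUnderPR and re-evaluate the same predictions.
evaluator.setMetricName("areaUnderPR")
evaluator.evaluate(predictions)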
--------------------------------------------------
# Exercise_9
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"})

--------------------------------------------------
# Exercise_10
# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))
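# The Tokenizer imported above is never used in the exercise as shown; a plausible
# next step (assuming the wrangled sms frame above) splits the cleaned text into
# tokens. The 'words' output column name is an illustrative assumption:
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)
wrangled.select('text', 'words').show(4, truncate=False)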
def main(sqlContext):
    """Main function takes a Spark SQL context."""
    # flag, save and predict are assumed module-level switches controlling which
    # stage of the pipeline runs.
    # Load the labeled data
    label = sqlContext.read.load("labeled_data.csv", format="csv",
                                 sep=",", inferSchema="true", header="true")
    if flag:
        comments = sqlContext.read.json("comments-minimal.json.bz2")
        submissions = sqlContext.read.json("submissions.json.bz2")
        print("loading done")
        comments.write.parquet("comments_data")
        submissions.write.parquet("submissions_data")
        print("writing done")
    else:
        # Read back the parquet files written above
        comments = sqlContext.read.parquet("comments_data")
        submissions = sqlContext.read.parquet("submissions_data")
        print("loading done")

    comments.show()
    # exit()  # leftover debugging stop; would abort the run if left in

    if save:
        # task 7 starts here
        associated = join(comments, label)
        withngrams = associated.withColumn("ngrams", makeNgrams_udf(associated['body']))
        withplabels = withngrams.withColumn("poslabel", pLabel_udf(withngrams['labeldjt']))
        withpnlabels = withplabels.withColumn(
            "neglabel", nLabel_udf(withplabels['labeldjt'])).select(
            "id", "ngrams", "poslabel", "neglabel")
        # withpnlabels.show()
        cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features")
        model = cv.fit(withpnlabels)
        model.save("cv.model")
        # model.transform(withpnlabels).show()
        pos = model.transform(withpnlabels).select(
            "id", col("poslabel").alias("label"), "features")
        neg = model.transform(withpnlabels).select(
            "id", col("neglabel").alias("label"), "features")
        # pos.show()
        # neg.show()
        poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
        neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=2)  # small fold count, for testing
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=2)  # small fold count, for testing
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        posModel.save("pos.model")
        negModel.save("neg.model")
        print("trained")
    else:
        # comments.show()
        # submissions.show()
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
        model = CountVectorizerModel.load("cv.model")
        # withngrams = comments.withColumn("ngrams", makeNgrams_udf(comments['body']))
        # cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features")
        # model = cv.fit(withngrams)
        print("model loaded")

    if predict == 0:
        # task 8 starts here
        temp_comments = comments.select("id", "link_id", "author_flair_text",
                                        "created_utc", "body")
        clean_comments = temp_comments.withColumn(
            "true_id", getLinkid_udf(temp_comments['link_id']))
        # print(clean_comments.count())
        clean_submissions = submissions.select(col("id").alias("sub_id"), "title")
        # clean_comments.show()
        # clean_submissions.show()
        com_sub = clean_comments.join(
            clean_submissions,
            clean_comments.true_id == clean_submissions.sub_id,
            "inner")
        com_sub.write.parquet("com_sub")
    else:
        # task 9 starts here
        com_sub = sqlContext.read.parquet("com_sub")
        com_sub = com_sub.sample(False, 0.0001, None)
        filtered = com_sub.filter(
            "body NOT LIKE '%/s%' and body NOT LIKE '>%'")
        # print(filtered.count())
        filtered_ngrams = filtered.withColumn(
            "ngrams", makeNgrams_udf(filtered['body']))
        # filtered_ngrams = filtered_ngrams.sample(False, 0.01, None)
        print("prepared")
        featuredata = model.transform(filtered_ngrams).select(
            "id", "author_flair_text", "created_utc", "sub_id",
            "title", "features")
        posResult = posModel.transform(featuredata)
        negResult = negModel.transform(featuredata)
        # posResult.show()
        # negResult.show()
        poslabel = posResult.withColumn(
            "positive", posTh_udf(posResult['probability'])
        )  # .select("id", "author_flair_text", "created_utc", "title", "positive")
        neglabel = negResult.withColumn(
            "negative", negTh_udf(negResult['probability'])
        )  # .select(col("id").alias("nid"), "author_flair_text", "created_utc", "title", "negative")
        print("predict done")
        # poslabel.show()
        # neglabel.show()
        # TODO: combine the positive and negative tables into one

        # task 10 starts here
        # c_all = poslabel.count()
        all_day = poslabel.withColumn(
            "date", from_unixtime('created_utc').cast(DateType())
        ).groupby("date").count()
        pos_posts = poslabel.filter("positive = 1")
        # c_pos_posts = pos_posts.count()
        # p_pos_posts = c_pos_posts/c_all
        # print(p_pos_posts)
        # neg_posts = neglabel.filter("negative = 1")
        # c_neg_posts = neg_posts.count()
        # p_neg_posts = c_neg_posts/c_all
        # print(p_neg_posts)
        pos_day = pos_posts.withColumn(
            "pos_date", from_unixtime('created_utc').cast(DateType())
        ).groupby("pos_date").count().withColumnRenamed("count", "pos_count")
        # Fraction of positive posts per day; column expressions are required here,
        # not the bare names the original used
        p_pos_day = all_day.join(pos_day, all_day.date == pos_day.pos_date, "left") \
            .withColumn("pos_per", col("pos_count") / col("count"))
        p_pos_day.show()
        print("end")
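# The function above relies on helper UDFs (makeNgrams_udf, posTh_udf, negTh_udf)
# that are not defined in this snippet. A minimal sketch of plausible definitions;
# the unigram tokenization and the 0.2/0.25 probability cutoffs are illustrative
# assumptions, not taken from the original code:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, IntegerType

# Illustrative n-gram maker: lowercase and split on whitespace (unigrams only)
makeNgrams_udf = udf(lambda body: body.lower().split(), ArrayType(StringType()))

# 'probability' is an ML vector; index 1 holds the positive-class probability
posTh_udf = udf(lambda prob: 1 if float(prob[1]) > 0.2 else 0, IntegerType())
negTh_udf = udf(lambda prob: 1 if float(prob[1]) > 0.25 else 0, IntegerType())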
# Split the combined frame back into train and test by the presence of 'target'
train_df = df_combined[-df_combined['target'].isnull()]
test_df = df_combined[df_combined['target'].isnull()]
train_df_sample = train_df.sample(5000, random_state=0)
target_train = train_df_sample['target']
train_data = train_df_sample.drop(['ID'], axis=1)
train_data = sqlContext.createDataFrame(train_data, list(train_data.columns))

# Assemble the feature vector, excluding the label column itself
feature_cols = [c for c in train_data.columns if c != 'target']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
train_data = assembler.transform(train_data)

lr = LogisticRegression(labelCol="target")
model = lr.fit(train_data)
prediction = model.transform(train_data)

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="target")
print "ROC score: {}".format(evaluator.evaluate(prediction))

# Collect the positive-class probabilities before handing them to sklearn;
# prediction.probability is a Column and cannot be iterated directly
probs = [float(row.probability[1]) for row in prediction.select('probability').collect()]
log_loss = metrics.log_loss(target_train, probs)
print "log loss: {}".format(log_loss)
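# Note: the snippet above scores the model on its own training data, which
# overstates performance. A minimal held-out evaluation, assuming the
# train_data, lr and evaluator objects defined above:
train_split, test_split = train_data.randomSplit([0.8, 0.2], seed=0)
held_out_model = lr.fit(train_split)
print "held-out ROC score: {}".format(evaluator.evaluate(held_out_model.transform(test_split)))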