def regression(df, column, name):
    try:
        model = CrossValidatorModel.load("data/{}.model".format(name))
    except Exception:
        LR = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
        # The 4th character of `name` distinguishes the positive model ('P') from the negative one.
        if name[3] == 'P':
            LR.setThreshold(0.2)
        else:
            LR.setThreshold(0.25)
        evaluator = BinaryClassificationEvaluator()  # renamed from `eval`, which shadows the builtin
        paramGrid = ParamGridBuilder().addGrid(LR.regParam, [1.0]).build()
        crossval = CrossValidator(estimator=LR, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=5)
        # train, test = df.select('features', func.col(column).alias("label")).randomSplit([0.5, 0.5])
        print("Training '{}' classifier... Please wait".format(name))
        model = crossval.fit(df.select("*", func.col(column).alias("label")))
        model.save("data/{}.model".format(name))
    # df_test = model.transform(df)
    # df_test.filter(df_test.prediction == 1).show()
    return model
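# A hedged usage sketch for the helper above. The DataFrame `df` and the label
# column "poslabel" are assumptions for illustration: any DataFrame with a
# vector column named "features" and a binary label column should work. The
# model name "djtPos" is hypothetical, chosen so its 4th character ('P')
# selects the lower 0.2 threshold branch in the helper.
pos_model = regression(df, "poslabel", "djtPos")
predictions = pos_model.transform(df)
predictions.select("prediction", "probability").show(5)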
def test_default_read_write(self):
    temp_path = tempfile.mkdtemp()
    lr = LogisticRegression()
    lr.setMaxIter(50)
    lr.setThreshold(0.75)
    writer = DefaultParamsWriter(lr)
    savePath = temp_path + "/lr"
    writer.save(savePath)
    reader = DefaultParamsReadable.read()
    lr2 = reader.load(savePath)
    self.assertEqual(lr.uid, lr2.uid)
    self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())
    # test overwrite
    lr.setThreshold(0.8)
    writer.overwrite().save(savePath)
    reader = DefaultParamsReadable.read()
    lr3 = reader.load(savePath)
    self.assertEqual(lr.uid, lr3.uid)
    self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
def test_default_read_write_default_params(self):
    lr = LogisticRegression()
    self.assertFalse(lr.isSet(lr.getParam("threshold")))
    lr.setMaxIter(50)
    lr.setThreshold(0.75)
    # `threshold` is set by user, default param `predictionCol` is not set by user.
    self.assertTrue(lr.isSet(lr.getParam("threshold")))
    self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
    self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

    writer = DefaultParamsWriter(lr)
    metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
    self.assertTrue("defaultParamMap" in metadata)

    reader = DefaultParamsReadable.read()
    metadataStr = json.dumps(metadata, separators=[",", ":"])
    loadedMetadata = reader._parseMetaData(metadataStr)
    reader.getAndSetParams(lr, loadedMetadata)
    self.assertTrue(lr.isSet(lr.getParam("threshold")))
    self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
    self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

    # manually create metadata without `defaultParamMap` section.
    del metadata["defaultParamMap"]
    metadataStr = json.dumps(metadata, separators=[",", ":"])
    loadedMetadata = reader._parseMetaData(metadataStr)
    with self.assertRaisesRegex(AssertionError, "`defaultParamMap` section not found"):
        reader.getAndSetParams(lr, loadedMetadata)

    # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
    metadata["sparkVersion"] = "2.3.0"
    metadataStr = json.dumps(metadata, separators=[",", ":"])
    loadedMetadata = reader._parseMetaData(metadataStr)
    reader.getAndSetParams(lr, loadedMetadata)
# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC #### Logistic Regression - Train

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Set the threshold for the probability above which to predict a 1
lr.setThreshold(training_data_positive_rate)
# lr.setThreshold(0.5)  # could use this if you knew you had balanced data

# Train the model with the training data
lrModel = lr.fit(trainingData)

# Get the training summary used for eval metrics and other params
lrTrainingSummary = lrModel.summary

# Find the best model threshold if you would like to use that instead of the
# empirical positive rate
fMeasure = lrTrainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
lrBestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
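# `training_data_positive_rate` is not defined in the notebook cell above; a
# minimal sketch of how it could be computed, assuming `trainingData` has a
# binary 0/1 column named "label":
positive_count = trainingData.filter(trainingData.label == 1).count()
training_data_positive_rate = positive_count / float(trainingData.count())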
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlrModel = mlr.fit(left_join)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

# Index labels, adding metadata to the label column.
def main(context):
    """Main function takes a Spark SQL context."""
    # TASK 1
    # Code for task 1...
    comments_DF = None
    submissions_DF = None
    labeled_data_DF = None
    comments_parquet = os.path.abspath("./comments-minimal.parquet")
    submissions_parquet = os.path.abspath("./submissions.parquet")
    labeled_data_parquet = os.path.abspath("./labeled_data.parquet")
    if os.path.exists(labeled_data_parquet):
        labeled_data_DF = context.read.parquet(labeled_data_parquet)
    else:
        labeled_data_DF = context.read.csv("labeled_data.csv", header=True)
        labeled_data_DF.write.parquet(labeled_data_parquet)
    if os.path.exists(submissions_parquet):
        submissions_DF = context.read.parquet(submissions_parquet)
    else:
        submissions_DF = context.read.json("submissions.json.bz2")
        submissions_DF.write.parquet(submissions_parquet)
    if os.path.exists(comments_parquet):
        comments_DF = context.read.parquet(comments_parquet)
    else:
        comments_DF = context.read.json("comments-minimal.json.bz2")
        comments_DF.write.parquet(comments_parquet)

    # TASK 2
    # Code for task 2...
    labeled_data_DF.createOrReplaceTempView("labeled_data")
    comments_DF.createOrReplaceTempView("comments")
    labeled_comments = context.sql(
        "select comments.id, cast(labeled_data.labeldjt as int) as label, body, author, author_flair_text, link_id, score, created_utc from labeled_data inner join comments on comments.id = labeled_data.Input_id"
    )
    # labeled_comments.select("id", "Input_id").show()
    # labeled_comments.show()

    # TASK 4, 5
    # Code for tasks 4 and 5
    context.udf.register(
        "sanitize",
        lambda body: reduce(lambda acc, elem: acc + elem.split(), sanitize(body)[1:], []),
        ArrayType(StringType()))
    labeled_comments.createOrReplaceTempView("labeled_comments")
    combined = context.sql("select *, sanitize(body) as words from labeled_comments")
    # combined.printSchema()
    # combined.select("body", "words").show()

    # TASK 6A
    # Code for task 6A...
    cv = CountVectorizer(inputCol="words", outputCol="features",
                         minDF=5.0, binary=True, vocabSize=1 << 18)
    vectorize_model = cv.fit(combined)
    vectorized = vectorize_model.transform(combined)
    vectorize_model.write().overwrite().save("www/vector.model")

    # TASK 6B
    # Code for task 6B...
    vectorized.createOrReplaceTempView("vectorized")
    labeled = context.sql(
        "select *, case when label = 1 then 1 else 0 end as poslabel, case when label = -1 then 1 else 0 end as neglabel from vectorized"
    )
    # labeled.show()

    # TASK 7
    # Code for task 7...
    pos = labeled
    neg = labeled

    # Bunch of imports (may need more)
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    posmodel_path = os.path.abspath("www/pos.model")
    negmodel_path = os.path.abspath("www/neg.model")

    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol
    # with the column containing the features.
    poslr = LogisticRegression(labelCol="poslabel", featuresCol="features", maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel", featuresCol="features", maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)  # we set thresholds here to avoid doing extra SQL queries at the end

    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()

    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()

    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid, numFolds=5)
    negCrossval = CrossValidator(estimator=neglr, evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid, numFolds=5)

    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    posModel = None
    negModel = None

    # Train the models
    if os.path.exists(posmodel_path):
        posModel = CrossValidatorModel.load(posmodel_path)
    else:
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save
        # the models and load them again later.
        posModel.write().overwrite().save(posmodel_path)
    if os.path.exists(negmodel_path):
        negModel = CrossValidatorModel.load(negmodel_path)
    else:
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        negModel.write().overwrite().save(negmodel_path)

    # TEST MODEL
    posResult = posModel.transform(posTest)
    posResult.createOrReplaceTempView("posResult")
    posAccuracy = context.sql(
        "select avg(case when poslabel = prediction then 1 else 0 end) as accuracy from posResult"
    )
    # posAccuracy.show()
    negResult = negModel.transform(negTest)
    negResult.createOrReplaceTempView("negResult")
    negAccuracy = context.sql(
        "select avg(case when neglabel = prediction then 1 else 0 end) as accuracy from negResult"
    )
    # negAccuracy.show()

    # PLOT ROC CURVE
    # `sc` is assumed to be the SparkContext set up elsewhere in the script.
    from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric
    results = posResult.select(['probability', 'poslabel'])
    results_collect = results.collect()
    results_list = [(float(i[0][0]), 1.0 - float(i[1])) for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, 'g--',
             label='Trump Positive Sentiment, ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    results = negResult.select(['probability', 'neglabel'])
    results_collect = results.collect()
    results_list = [(float(i[0][0]), 1.0 - float(i[1])) for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, 'r--',
             label='Trump Negative Sentiment, ROC curve (area = %0.2f)' % roc_auc)
    plt.legend(loc="lower right")
    plt.savefig('trump_ROC.png')
    # TASK 8
    # Code for task 8...
    # get title of submission post
    submissions_DF.createOrReplaceTempView("submissions")
    comments_DF.createOrReplaceTempView("comments")
    whole_data = context.sql(
        "select s.id as submission_id, s.title, s.author_cakeday, s.created_utc, s.author_flair_text, s.over_18, c.controversiality, c.body as body, c.id as comment_id, c.score as comment_score, s.score as story_score from comments c inner join submissions s on s.id = SUBSTR(c.link_id, 4, LENGTH(c.link_id) - 3) where body not like '%/s' and body not like '>%'"
    )
    whole_data.show(20)
    sampled = whole_data.sample(False, 0.5, 42)
    # sampled.show(20)
    # whole_data.count()
    # sampled.count()

    # TASK 9
    # Code for task 9...
    context.udf.register(
        "sanitize",
        lambda body: reduce(lambda acc, elem: acc + elem.split(), sanitize(body)[1:], []),
        ArrayType(StringType()))
    sampled.createOrReplaceTempView("sampled")
    combined = context.sql("select *, sanitize(body) as words from sampled")
    combined.printSchema()
    combined = combined.select("sampled.comment_id", "sampled.submission_id",
                               "sampled.title", "sampled.created_utc",
                               "sampled.author_flair_text", "sampled.author_cakeday",
                               "sampled.over_18", "sampled.controversiality",
                               "sampled.body", "words", "sampled.comment_score",
                               "sampled.story_score")
    # combined.show()
    vectorized = vectorize_model.transform(combined)
    vectorized.show()
    posResult = posModel.transform(vectorized)
    posResult = posResult.withColumnRenamed('prediction', 'pos') \
        .drop("rawPrediction").drop("probability")
    result = negModel.transform(posResult)
    result = result.withColumnRenamed('prediction', 'neg') \
        .drop("rawPrediction").drop("probability")
    temp = result
    temp = temp.drop("body", "words", "features")
    result = result.drop("body", "words", "features", "title")
    # result.show()

    # TASK 10
    # Code for task 10...
    result.createOrReplaceTempView("result")

    # number 1
    totalrows = result.count()
    PosPercentage = result.filter(result.pos == 1.0).count() * 100 / totalrows
    NegPercentage = result.filter(result.neg == 1.0).count() * 100 / totalrows
    print("Positive Percentage: {}%".format(PosPercentage))
    print("Negative Percentage: {}%".format(NegPercentage))

    # number 2
    # https://medium.com/@mrpowers/working-with-dates-and-times-in-spark-491a9747a1d2
    with_time = result.withColumn(
        "date", F.from_unixtime(functions.col('created_utc')).cast(DateType()))
    with_time_pos = with_time.groupBy("date").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    with_time_neg = with_time.groupBy("date").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    time_data = with_time_pos.join(with_time_neg, ["date"])
    time_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("time_data.csv")

    # number 3
    states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
              'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
              'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
              'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
              'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
              'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
              'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
              'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
              'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
    # create a data frame so we can join the two tables together and eliminate
    # any non-state author_flair_text
    statelist = context.createDataFrame(states, StringType())
    # https://stackoverflow.com/questions/40421356/how-to-do-left-outer-join-in-spark-sql
    # https://docs.databricks.com/spark/latest/faq/join-two-dataframes-duplicated-column.html
    # we found that the attribute name for the statelist dataframe was "value" by using printSchema()
    statelist = statelist.withColumnRenamed("value", "state")
    result = result.withColumnRenamed("author_flair_text", "state")
    pos_states = result.groupBy("state").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    new_pos_states = pos_states.join(statelist, ["state"], "inner")
    new_pos_states = new_pos_states.withColumnRenamed("(sum(pos) / count(pos))", "Positive")
    neg_states = result.groupBy("state").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    new_neg_states = neg_states.join(statelist, ["state"], "inner")
    new_neg_states = new_neg_states.withColumnRenamed("(sum(neg) / count(neg))", "Negative")
    # tried doing left_outer initially, but not all author flair text is a state,
    # so we need to do an inner join instead
    state_data = new_pos_states.join(new_neg_states, ["state"], "inner")
    state_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("state_data.csv")

    # final deliverable number 4
    commentPos = result.groupBy("comment_score").agg(
        functions.sum(result.pos) / functions.count(result.pos))  # for some reason scalar values don't work???
    storyPos = result.groupBy("story_score").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    commentNeg = result.groupBy("comment_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    storyNeg = result.groupBy("story_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    comment_data = commentPos.join(commentNeg, ["comment_score"])
    submission_data = storyPos.join(storyNeg, ["story_score"])
    comment_data.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("comment_data.csv")
    submission_data.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("submission_data.csv")

    # http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html
    # Final Deliverable part 4
    temp.createOrReplaceTempView("temp")
    top_pos = context.sql(
        "select title, (sum(pos) / count(pos)) as Positive from temp group by title order by Positive desc limit 10"
    )
    top_neg = context.sql(
        "select title, (sum(neg) / count(neg)) as Negative from temp group by title order by Negative desc limit 10"
    )
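# Side note on the "(sum(pos) / count(pos))" column names that the snippets
# above rename afterwards: aliasing the aggregate expression up front avoids
# the rename step entirely. A sketch, reusing the `result` DataFrame and the
# `functions` import assumed above:
commentPos = result.groupBy("comment_score").agg(
    (functions.sum(result.pos) / functions.count(result.pos)).alias("Positive"))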
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(training)

# $example on$
# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)
# $example off$

spark.stop()
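# Note on the example above: lr.setThreshold(bestThreshold) mutates the
# *estimator*, not the already-fitted lrModel, so the tuned threshold only
# affects models fitted afterwards (e.g. a second lr.fit(training) call).
# Recent PySpark versions also expose the setter on the fitted model, so
# lrModel.setThreshold(bestThreshold) would apply it directly; treat that as
# an assumption to verify against your Spark version.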
print("objectiveHistory:") for objective in objectiveHistory: print(objective) # Obtain the receiver-operating characteristic as a dataframe and areaUnderROC. trainingSummary.roc.show() print("areaUnderROC: " + str(trainingSummary.areaUnderROC)) # Set the model threshold to maximize F-Measure fMeasure = trainingSummary.fMeasureByThreshold maxFMeasure = fMeasure.groupBy().max('F-Measure').select( 'max(F-Measure)').head() bestThreshold = fMeasure.where( fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']).select( 'threshold').head()['threshold'] logistic.setThreshold(bestThreshold) print('best threshold is:' + str(bestThreshold)) print("For Logistic regression:") trained_model = logistic.fit(train) res = trained_model.transform(test) metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd) print('Accuracy on test set: ', evaluator.evaluate(res)) print('Area under ROC curve: ', eval.evaluate(res)) # find_performance_metrics(res, "logistic regression") find_performance_metrics(res, "logistic regression with best threshold") df = pd.DataFrame({ 'lr_coeff': trained_model.coefficients, 'feature_column': feature_columns, })
def main(context):
    """Main function takes a Spark SQL context."""
    # TASK 1
    # the read is from the parquet file
    comments = sqlContext.read.parquet("comments-minimal.parquet")
    submissions = sqlContext.read.parquet("submissions.parquet")
    # only look at columns that are useful
    comments = comments.select("id", "created_utc", "body", "author_flair_text",
                               "link_id", "score") \
        .withColumnRenamed("score", "commentscore")
    submissions = submissions.select("id", "title", "score") \
        .withColumnRenamed("score", "storyscore")
    # comments.write.parquet("comments-minimal.parquet")
    # submissions.write.parquet("submissions.parquet")

    # TASK 2
    labeled_data = sqlContext.read.format("csv").options(
        header='true', inferSchema='true').load('labeled_data.csv')
    # here we do the join on comment id
    joined = comments.join(labeled_data, comments.id == labeled_data.Input_id)
    # comments.join(labeled_data, comments.id == labeled_data.Input_id).explain()

    # TASK 4
    # sanitize_new ignores the processed string given by sanitize
    from cleantext import sanitize

    def sanitize_new(text):
        r = sanitize(text)[1:]
        return r[0].split(" ") + r[1].split(" ") + r[2].split(" ")

    # TASK 5
    # create the udf, generate new column of n-grams
    sanitize_udf = udf(sanitize_new, ArrayType(StringType()))
    joined = joined.withColumn("ngrams", sanitize_udf(joined.body))

    # TASK 6A
    # construct the feature vector based on "ngrams" and store the transformed
    # column in "features"; CountVectorizer produces a sparse vector by
    # default, so no change is needed
    cv = CountVectorizer(inputCol="ngrams", outputCol="features", minDF=5.0, binary=True)
    cv_model = cv.fit(joined)
    joined = cv_model.transform(joined)

    # TASK 6B
    # construct pos column and neg column;
    # for this project, only look at the label on Trump
    pos_udf = udf(lambda label: 1 if label == 1 else 0, IntegerType())
    neg_udf = udf(lambda label: 1 if label == -1 else 0, IntegerType())
    joined = joined.withColumn("poslabel", pos_udf(joined.labeldjt))
    joined = joined.withColumn("neglabel", neg_udf(joined.labeldjt))

    # TASK 7
    # train logistic regression models (code adapted from the project spec)
    # Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="poslabel", featuresCol="features", maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel", featuresCol="features", maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
    negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid, numFolds=5)
    negCrossval = CrossValidator(estimator=neglr, evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid, numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = joined.randomSplit([0.5, 0.5])
    negTrain, negTest = joined.randomSplit([0.5, 0.5])

    # Train the models
    posModel = posCrossval.fit(posTrain)
    negModel = negCrossval.fit(negTrain)

    # TASK: Extra Credit's Curve:
    # evaluate the model
    # posTestRes = posModel.transform(posTest).toPandas()['probability']
    # posTestRes = np.array([i[1] for i in posTestRes])
    # negTestRes = negModel.transform(negTest).toPandas()['probability']
    # negTestRes = np.array([i[1] for i in negTestRes])
    # print(negTestRes, posTestRes)
    # print('ok')
    # pfpr, ptpr, _ = metrics.roc_curve(posTest.select('poslabel').toPandas(), posTestRes)
    # nfpr, ntpr, _ = metrics.roc_curve(negTest.select('neglabel').toPandas(), negTestRes)
    # print(pfpr[:5], ptpr[:5], nfpr[:5], ntpr[:5])
    # plt.plot(pfpr, ptpr, label='posModel')
    # plt.plot(nfpr, ntpr, label='negModel')
    # plt.legend()
    # plt.savefig('ROC.png')
    # plt.close()

    # save the models
    # posModel.save("www/pos.model")
    # negModel.save("www/neg.model")

    # load instead
    # posModel = CrossValidatorModel.load("www/pos.model")
    # negModel = CrossValidatorModel.load("www/neg.model")
    # print("finished loading model")

    # TASK 8.1
    # selected column 'created_utc' and transformed it in 10.2 using from_unixtime

    # TASK 8.2
    # title of the submission of the comment
    comments = comments.withColumn("clean_id", regexp_replace("link_id", r'^t3_', ''))
    comments = comments.join(submissions,
                             comments.clean_id == submissions.id).drop(submissions.id)

    # TASK 8.3
    # Please see TASK 10.3 (by state) below

    # TASK 9
    # filter out comments that contain "\s" or start with ">"
    comments = comments.filter(~comments.body.rlike(r'^>')) \
        .filter(~comments.body.rlike(r'\\s'))
    # sample; pass a fixed seed instead of None to make the sample reproducible
    comments = comments.sample(False, sampleRate, None)

    # redo 4, 5, 6a
    comments = comments.withColumn("ngrams", sanitize_udf(comments.body))
    comments = cv_model.transform(comments)
    # print("done with transforming the sampled comments")

    # make predictions
    comments = posModel.transform(comments) \
        .drop("body", "link_id", "clean_id", "ngrams", "rawPrediction", "probability") \
        .withColumnRenamed("prediction", "poslabel")
    comments = negModel.transform(comments) \
        .drop("features", "rawPrediction", "probability") \
        .withColumnRenamed("prediction", "neglabel")

    # TASK 10.1
    # compute the percentage of positive and negative comments
    # print("Percentage of positive comments")
    result = comments.select('poslabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("pos-perc.csv")
    # print("Percentage of negative comments")
    result = comments.select('neglabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("neg-perc.csv")

    # TASK 10.2
    # 2. by date
    comments = comments.withColumn("date", from_unixtime(comments.created_utc, "YYYY-MM-dd"))
    result = comments.groupBy("date").agg({"poslabel": "mean", "neglabel": "mean"})
    result.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("time_data.csv")

    # TASK 10.3
    # 3. by state
    val_state_udf = udf(lambda state: state if state in states else None, StringType())
    comments = comments.withColumn("state", val_state_udf(lower(comments.author_flair_text)))
    comments = comments.filter(comments.state.isNotNull())
    result = comments.groupBy("state").agg({"poslabel": "mean", "neglabel": "mean"})
    result.show(truncate=False)
    # print(result.count())
    result.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("state_data.csv")

    # TASK 10.4
    # 4a. by comment score
    result = comments.groupBy("commentscore").agg({"poslabel": "mean", "neglabel": "mean"})
    result.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("comment_score.csv")
    # 4b. by story score
    result = comments.groupBy("storyscore").agg({"poslabel": "mean", "neglabel": "mean"})
    result.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("story_score.csv")

    # DELIVERABLE 4
    story = result.orderBy('avg(poslabel)', ascending=False).limit(10)
    # a join is too expensive, and a subquery is also expensive
    score_list = set(story.select('storyscore').toPandas()['storyscore'])
    comments[comments.storyscore.isin(score_list)].select(
        'storyscore', 'title').limit(20).show(truncate=False)
    story = result.orderBy('avg(neglabel)', ascending=False).limit(10)
    score_list = set(story.select('storyscore').toPandas()['storyscore'])
    comments[comments.storyscore.isin(score_list)].select(
        'storyscore', 'title').limit(20).show(truncate=False)
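# `states` in the udf above is assumed to be a collection of lowercase U.S.
# state names defined elsewhere in the script; a minimal sketch of how it
# could be built from the full-name list used in the earlier snippet (only a
# subset is shown here for brevity):
state_names = ['Alabama', 'Alaska', 'Arizona', 'Wisconsin', 'Wyoming']  # all 50 in practice
states = set(name.lower() for name in state_names)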
def Logistic_regression(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")
    dataset.show()
    dataset.groupBy("y").count().show()
    label = ''
    for y in label_colm:
        label = y
    # build an R-style formula string such as "y ~ x1+x2+x3"
    f = label + " ~ "
    for x in feature_colm:
        f = f + x + "+"
    f = f[:-1]
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    output = formula.fit(dataset).transform(dataset)
    finalized_data = output.select("features", "label")
    finalized_data.show()
    train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)
    Accuracy_list = []
    FPR_list = []
    TPR_list = []
    precision_list = []
    recall_list = []
    lr = LogisticRegression(maxIter=5)
    lrModel = lr.fit(train_data)
    print("coefficients: " + str(lrModel.coefficientMatrix))
    print("intercept: " + str(lrModel.interceptVector))
    training_summary = lrModel.summary
    print(" area under roc : ", training_summary.areaUnderROC)
    print(" roc : ", training_summary.roc)
    roc = training_summary.roc
    roc.show()
    roc.write.parquet(
        'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/ROC_plot.parquet',
        mode='overwrite')
    print(" pr value : ", training_summary.pr)
    pr = training_summary.pr
    pr.show()
    pr.write.parquet(
        'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/PR_plot.parquet',
        mode='overwrite')
    print(" precision by threshold : ", training_summary.precisionByThreshold)
    prec_by_threshold = training_summary.precisionByThreshold
    prec_by_threshold.show()
    print(" accuracy : ", training_summary.accuracy)
    accuracy_d = training_summary.accuracy
    print(accuracy_d)
    fMeasure = training_summary.fMeasureByThreshold
    fMeasure.show()
    maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
    bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
        .select('threshold').head()['threshold']
    lr.setThreshold(bestThreshold)
    objectiveHistory = training_summary.objectiveHistory
    print("objectiveHistory")
    for objective in objectiveHistory:
        print(objective)
    print("false positive rate by label:")
    for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
        print("label %d: %s" % (i, rate))
    print("True positive rate")
    for i, rate in enumerate(training_summary.truePositiveRateByLabel):
        print("label %d : %s" % (i, rate))
    print("Precision by label:")
    for i, prec in enumerate(training_summary.precisionByLabel):
        print("label %d: %s" % (i, prec))
    print("Recall by label:")
    for i, rec in enumerate(training_summary.recallByLabel):
        print("label %d: %s" % (i, rec))
    print("F-measure by label:")
    for i, f in enumerate(training_summary.fMeasureByLabel()):
        print("label %d: %s" % (i, f))
    accuracy = training_summary.accuracy
    falsePositiveRate = training_summary.weightedFalsePositiveRate
    truePositiveRate = training_summary.weightedTruePositiveRate
    fMeasure = training_summary.weightedFMeasure()
    precision = training_summary.weightedPrecision
    recall = training_summary.weightedRecall
    print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
          % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
    Accuracy_list.append(accuracy)
    FPR_list.append(falsePositiveRate)
    TPR_list.append(truePositiveRate)
    precision_list.append(precision)
    recall_list.append(recall)
    print(Accuracy_list)
    print(FPR_list)
    print(TPR_list)
    print(precision_list)
    print(recall_list)
    # ROC graph
    fpr = roc.select("FPR").toPandas()
    tpr = roc.select("TPR").toPandas()
    plt.plot(fpr, tpr)
    plt.show()
    # PR graph
    pr_recall = pr.select("recall").toPandas()
    pr_precision = pr.select("precision").toPandas()
    plt.plot(pr_precision, pr_recall)
    plt.show()
    # apply the fitted model to the test data
    prediction_val = lrModel.transform(test_data)
    prediction_val.groupBy("label", "prediction").count().show()
    prediction_val.show()
    prediction_val.groupBy("prediction").count().show()
    prediction_val.groupBy("prediction", "probability").count().show()
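# A hypothetical invocation of the function above; the file path and column
# names are assumptions styled after the bank-marketing-type CSV implied by
# sep=";" and the "y" label column used in the snippet:
Logistic_regression("bank.csv", ["age", "balance", "duration"], ["y"])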
def Logistic_regression(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")
    dataset.show()
    dataset.groupBy("y").count().show()
    label = ''
    for y in label_colm:
        label = y
    print(label)

    # using the rformula for indexing, encoding and vectorising
    # f = label + " ~ "
    # for x in features:
    #     f = f + x + "+"
    # f = f[:-1]

    # extracting the schema
    val = dataset.schema
    string_features = []
    integer_features = []
    for x in val:
        if str(x.dataType) == "StringType":
            for y in feature_colm:
                if x.name == y:
                    string_features.append(x.name)
        else:
            for y in feature_colm:
                if x.name == y:
                    integer_features.append(x.name)
    print(string_features)
    print(integer_features)
    print(val)

    for z in val:
        if z.name == label and str(z.dataType) == "StringType":
            label_indexer = StringIndexer(inputCol=label,
                                          outputCol='indexed_' + label).fit(dataset)
            dataset = label_indexer.transform(dataset)
        if z.name == label and str(z.dataType) in ("IntegerType", "FloatType", "DoubleType"):
            dataset = dataset.withColumnRenamed(label, 'indexed_' + label)

    indexed_features = []
    encoded_features = []
    for col in string_features:
        indexer = StringIndexer(inputCol=col, outputCol='indexed_' + col).fit(dataset)
        indexed_features.append('indexed_' + col)
        dataset = indexer.transform(dataset)
        # encoder = OneHotEncoderEstimator(inputCols=['indexed_' + col],
        #                                  outputCols=['encoded_' + col]).fit(dataset)
        # encoded_features.append('encoded_' + col)
        # dataset = encoder.transform(dataset)
    print(indexed_features)
    print(encoded_features)

    # combining both feature column lists together
    final_features = integer_features + indexed_features
    print(final_features)

    # now using the vector assembler
    featureassembler = VectorAssembler(inputCols=final_features, outputCol="features")
    dataset = featureassembler.transform(dataset)
    dataset.show()

    # using the vector indexer (for categorical data, a kind of one-hot encoding)
    vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features',
                                maxCategories=15).fit(dataset)
    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
    vec_indexed = vec_indexer.transform(dataset)
    vec_indexed.show()

    # preparing the finalized data
    finalized_data = vec_indexed.select('indexed_' + label, 'vec_indexed_features')
    finalized_data.show()

    # splitting the dataset into train and test
    train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)

    # implementing the logistic regression
    Accuracy_list = []
    FPR_list = []
    TPR_list = []
    precision_list = []
    recall_list = []
    y = 0.1
    for i in range(0, 3):
        y = round(y + 0.1, 2)
        lr = LogisticRegression(featuresCol='vec_indexed_features',
                                labelCol='indexed_' + label,
                                maxIter=5, regParam=0.1,
                                elasticNetParam=1.0, threshold=0.3)
        # fit the model
        lrModel = lr.fit(train_data)
        # print the coefficients and the intercept for the logistic regression
        print("coefficients: " + str(lrModel.coefficientMatrix))
        print("intercept: " + str(lrModel.interceptVector))

        # getting the summary of the model; f-measure calculation
        from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary
        training_summary = lrModel.summary
        print(" area under roc : ", training_summary.areaUnderROC)
        print(" roc : ", training_summary.roc)
        roc = training_summary.roc
        roc.show()
        print(" pr value : ", training_summary.pr)
        pr = training_summary.pr
        pr.show()
        print(" precision by threshold : ", training_summary.precisionByThreshold)
        prec_by_threshold = training_summary.precisionByThreshold
        prec_by_threshold.show()
        print(" accuracy : ", training_summary.accuracy)
        accuracy_d = training_summary.accuracy
        print(accuracy_d)
        fMeasure = training_summary.fMeasureByThreshold
        fMeasure.show()
        maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
        bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
            .select('threshold').head()['threshold']
        lr.setThreshold(bestThreshold)

        # obtain the objective per iteration
        objectiveHistory = training_summary.objectiveHistory
        print("objectiveHistory")
        for objective in objectiveHistory:
            print(objective)

        # for a multiclass model we can inspect the metrics on a per-label basis
        print("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))
        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print("label %d : %s" % (i, rate))
        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))
        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))
        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))
        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall
        print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
              % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
        Accuracy_list.append(accuracy)
        FPR_list.append(falsePositiveRate)
        TPR_list.append(truePositiveRate)
        precision_list.append(precision)
        recall_list.append(recall)
        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)

    import matplotlib.pyplot as plt
    # data visualization: ROC graph
    fpr = roc.select("FPR").toPandas()
    tpr = roc.select("TPR").toPandas()
    plt.plot(fpr, tpr)
    plt.show()

    # PR graph
    pr_recall = pr.select("recall").toPandas()
    pr_precision = pr.select("precision").toPandas()
    plt.plot(pr_precision, pr_recall)
    plt.show()

    # now applying the fitted model to the test data
    prediction_val = lrModel.transform(test_data)
    prediction_val.groupBy('indexed_' + label, "prediction").count().show()
    prediction_val.show()
    prediction_val.groupBy("prediction").count().show()
    prediction_val.groupBy("prediction", "probability").count().show()
def Logistic_regression(dataset_add, features, label):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")
    dataset.show()
    dataset.groupBy("y").count().show()

    # using the rformula for indexing, encoding and vectorising
    f = label + " ~ "
    for x in features:
        f = f + x + "+"
    f = f[:-1]
    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    output = formula.fit(dataset).transform(dataset)
    output_2 = output.select("features", "label")
    output_2.show()

    # splitting the dataset into train and test
    train_data, test_data = output_2.randomSplit([0.75, 0.25], seed=40)

    # implementing the logistic regression
    Accuracy_list = []
    FPR_list = []
    TPR_list = []
    precision_list = []
    recall_list = []
    y = 0.1
    for i in range(0, 3):
        y = round(y + 0.1, 2)
        lr = LogisticRegression(maxIter=5, regParam=0.1, elasticNetParam=1.0, threshold=0.3)
        # fit the model
        lrModel = lr.fit(train_data)
        # print the coefficients and the intercept for the logistic regression
        print("coefficients: " + str(lrModel.coefficientMatrix))
        print("intercept: " + str(lrModel.interceptVector))

        # getting the summary of the model; f-measure calculation
        from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary
        training_summary = lrModel.summary
        print(" area under roc : ", training_summary.areaUnderROC)
        print(" roc : ", training_summary.roc)
        roc = training_summary.roc
        roc.show()
        print(" pr value : ", training_summary.pr)
        pr = training_summary.pr
        pr.show()
        print(" precision by threshold : ", training_summary.precisionByThreshold)
        prec_by_threshold = training_summary.precisionByThreshold
        prec_by_threshold.show()
        print(" accuracy : ", training_summary.accuracy)
        accuracy_d = training_summary.accuracy
        print(accuracy_d)
        fMeasure = training_summary.fMeasureByThreshold
        fMeasure.show()
        maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
        bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
            .select('threshold').head()['threshold']
        lr.setThreshold(bestThreshold)

        # obtain the objective per iteration
        objectiveHistory = training_summary.objectiveHistory
        print("objectiveHistory")
        for objective in objectiveHistory:
            print(objective)

        # for a multiclass model we can inspect the metrics on a per-label basis
        print("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))
        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print("label %d : %s" % (i, rate))
        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))
        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))
        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))
        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall
        print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
              % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
        Accuracy_list.append(accuracy)
        FPR_list.append(falsePositiveRate)
        TPR_list.append(truePositiveRate)
        precision_list.append(precision)
        recall_list.append(recall)
        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)

    import matplotlib.pyplot as plt
    # data visualization: ROC graph
    fpr = roc.select("FPR").toPandas()
    tpr = roc.select("TPR").toPandas()
    plt.plot(fpr, tpr)
    plt.show()

    # PR graph
    pr_recall = pr.select("recall").toPandas()
    pr_precision = pr.select("precision").toPandas()
    plt.plot(pr_precision, pr_recall)
    plt.show()

    # now applying the fitted model to the test data
    prediction_val = lrModel.transform(test_data)
    prediction_val.groupBy("label", "prediction").count().show()
    prediction_val.show()
    prediction_val.groupBy("prediction").count().show()
    prediction_val.groupBy("prediction", "probability").count().show()
# set best threshold
maxFMeasure = fMeasure.groupBy().max('F-Measure') \
    .select('max(F-Measure)').head()['max(F-Measure)']
print("maxFMeasure")
print(maxFMeasure)
fMeasure_new = spark.createDataFrame(fMeasure.rdd, ['threshold', 'fmeasure'])
bestThreshold = fMeasure_new.where(
    maxFMeasure - fMeasure_new.fmeasure < 0.00001).select('threshold').head()
print("bestThreshold")
print(bestThreshold)
print("original threshold")
print(lr.getThreshold())
# lr.setThreshold(bestThreshold['threshold'])
lr.setThreshold(-3)
print("new threshold")
print(lr.getThreshold())
# lr.setThreshold(bestThreshold)
# $example off$

# test
result_all = lrModel.transform(training)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
res = evaluator.evaluate(result_all)
print("evaluator res:")
print(res)
res = evaluator.evaluate(result_all, {evaluator.metricName: "areaUnderPR"})
print("evaluator pr res:")
print(res)
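# Caveat on the evaluator above: pointing rawPredictionCol at the 0/1
# "prediction" column collapses the ROC to a single operating point, so the
# reported areaUnderROC is not the usual ranking metric. A sketch of the more
# conventional setup, using the model's raw scores (the "rawPrediction"
# column is PySpark's default output name):
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print("areaUnderROC:")
print(evaluator.evaluate(result_all))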