filter("Flag is null").\
    select(positive.Id, positive.Text, positive.Label)
# NOTE(review): the two lines above are the tail of a statement that starts
# before this chunk; they are reproduced unchanged.

# Held-out evaluation set: negative and positive test rows stacked together.
testing = negativeTest.unionAll(positiveTest)

# CREATE MODEL
numFeatures = 64000
numEpochs = 30
regParam = 0.02

# Pipeline stages: split "Text" into "Words", hash the words into a
# fixed-width "Features" vector, then fit a logistic regression on "Label".
tokenizer = Tokenizer().setInputCol("Text").setOutputCol("Words")
hashingTF = HashingTF().setNumFeatures(numFeatures).\
    setInputCol("Words").setOutputCol("Features")
lr = LogisticRegression().setMaxIter(numEpochs).setRegParam(regParam).\
    setFeaturesCol("Features").setLabelCol("Label").\
    setRawPredictionCol("Score").setPredictionCol("Prediction")
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])

model = pipeline.fit(training)
testingResult = model.transform(testing)

# BinaryClassificationMetrics expects (score, label) pairs.  Feeding it the
# thresholded "Prediction" column collapses the ROC curve to a single
# operating point, so the raw "Score" column is used instead.
# NOTE(review): assumes "Score" is the 2-element raw-prediction vector whose
# index 1 belongs to the positive class -- confirm against the Spark version
# in use.
testingResultScores = testingResult.select("Score", "Label").rdd.map(
    lambda r: (float(r[0][1]), float(r[1])))

bc = BinaryClassificationMetrics(testingResultScores)
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(bc)

# Calls the wrapped JVM object's `roc` method by name through the wrapper's
# generic `call` hook, then prints the first 20 items of the result.
newbc = bc.call('roc')
print(newbc.take(20))

# NOTE(review): in PySpark `areaUnderROC` appears to be a property, not a
# method -- confirm before re-enabling the lines below.
#roc = bc.areaUnderROC
#print("Area under the ROC:", roc)
feats = [float(line['cc_avg']), float(line['pp_avg'])] return LabeledPoint(float(line['label']), feats) mobile_lp = mobile_data.map(lambda x: create_label_point(x)) mobile_lp_f = mobile_lp_f.filter(lambda line: line is not None) training, test = mobile_lp_f.randomSplit([0.8, 0.2], seed=11) training.cache() model = LogisticRegressionWithLBFGS.train(training, iterations=500, regParam=0.01, regType='l2') labelsAndPreds = training.map(lambda p: (p.label, model.predict(p.features))) trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float( training.count()) print("Training Error = " + str(trainErr)) predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) testErr = predictionAndLabels.filter( lambda lp: lp[0] != lp[1]).count() / float(test.count()) print("Testing Error = " + str(testErr)) # Instantiate metrics object metrics = BinaryClassificationMetrics(predictionAndLabels) # Area under precision-recall curve print("Area under PR = %s" % metrics.areaUnderPR) # Area under ROC curve print("Area under ROC = %s" % metrics.areaUnderROC)