コード例 #1
0
                        filter("Flag is null").\
                        select(positive.Id, positive.Text, positive.Label)
# Stack the negative and positive test DataFrames into one evaluation set.
testing = negativeTest.unionAll(positiveTest)


# CREATE MODEL
# Hyper-parameters: number of hashed feature buckets, iteration cap passed to
# the optimizer (setMaxIter), and the regularization parameter.
numFeatures = 64000
numEpochs = 30
regParam = 0.02

# Stage 1: read the "Text" column and write the result to "Words".
tokenizer = (
    Tokenizer()
    .setInputCol("Text")
    .setOutputCol("Words")
)

# Stage 2: hash "Words" into a "Features" column with numFeatures buckets.
hashingTF = (
    HashingTF()
    .setNumFeatures(numFeatures)
    .setInputCol("Words")
    .setOutputCol("Features")
)

# Stage 3: logistic regression reading "Features"/"Label" and writing its raw
# prediction to "Score" and its prediction to "Prediction".
lr = (
    LogisticRegression()
    .setMaxIter(numEpochs)
    .setRegParam(regParam)
    .setFeaturesCol("Features")
    .setLabelCol("Label")
    .setRawPredictionCol("Score")
    .setPredictionCol("Prediction")
)

# Chain the three stages in the order they must run.
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])
 

# Fit every pipeline stage on the training data.
model = pipeline.fit(training)

# Run the fitted pipeline over the evaluation set; the result carries the
# "Prediction" column alongside the original "Label" column.
testingResult = model.transform(testing)

# BinaryClassificationMetrics consumes an RDD of (float, float) pairs, so both
# selected columns are cast explicitly.
# NOTE(review): "Prediction" looks like the thresholded class label rather than
# a continuous score, which would make the ROC curve degenerate. Feeding the
# positive-class entry of the "Score" column may be what was intended - confirm.
testingResultScores = testingResult.select("Prediction", "Label").rdd.map(
    lambda r: (float(r[0]), float(r[1])))
bc = BinaryClassificationMetrics(testingResultScores)
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the bare print statement, which is a SyntaxError on Python 3.
print(bc)

# call('roc') goes through the metrics wrapper to fetch the ROC curve points;
# presumably used because the Python class has no public roc() accessor -
# TODO confirm against the installed Spark version.
newbc = bc.call('roc')
print(newbc.take(20))
# If only the scalar AUC is wanted, read bc.areaUnderROC: it is a property,
# so it must not be called with parentheses.
コード例 #2
0
    feats = [float(line['cc_avg']), float(line['pp_avg'])]
    return LabeledPoint(float(line['label']), feats)


# Turn every raw record into a LabeledPoint via create_label_point.
mobile_lp = mobile_data.map(create_label_point)
# Keep only the records that produced a value. The filter must read from
# mobile_lp: the previous code filtered mobile_lp_f while defining it, which
# left mobile_lp unused and referenced a name not yet assigned here.
# Presumably create_label_point returns None for records it cannot convert -
# verify against its definition.
mobile_lp_f = mobile_lp.filter(lambda line: line is not None)

# 80/20 train/test split with a fixed seed so the split is reproducible.
training, test = mobile_lp_f.randomSplit([0.8, 0.2], seed=11)
# The training RDD is traversed several times below, so keep it cached.
training.cache()

# Fit a logistic regression with L-BFGS on the training split, using the
# 'l2' regularization type.
model = LogisticRegressionWithLBFGS.train(
    training, iterations=500, regParam=0.01, regType='l2')

# Pair each training label with the model's prediction for that point.
labelsAndPreds = training.map(
    lambda point: (point.label, model.predict(point.features)))

# Training error = share of (label, prediction) pairs that disagree.
trainMismatches = labelsAndPreds.filter(
    lambda pair: pair[0] != pair[1]).count()
trainTotal = training.count()
trainErr = trainMismatches / float(trainTotal)
print("Training Error = " + str(trainErr))

# Pair the model's prediction (cast to float) with the true label for every
# point in the test split.
predictionAndLabels = test.map(
    lambda point: (float(model.predict(point.features)), point.label))

# Testing error = share of (prediction, label) pairs that disagree.
testMismatches = predictionAndLabels.filter(
    lambda pair: pair[0] != pair[1]).count()
testTotal = test.count()
testErr = testMismatches / float(testTotal)
print("Testing Error = " + str(testErr))

# Build the binary-classification metrics from the same pairs, then report
# the area under the precision-recall curve followed by the area under ROC.
metrics = BinaryClassificationMetrics(predictionAndLabels)
print("Area under PR = %s" % metrics.areaUnderPR)
print("Area under ROC = %s" % metrics.areaUnderROC)