Example #1
def regression(df, column, name):
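    # Summary: load a previously saved CrossValidatorModel from data/<name>.model if it
    # exists; otherwise train a 5-fold cross-validated LogisticRegression on `df`
    # (with `column` aliased to "label") and persist the fitted model for later runs.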
    try:
        model = CrossValidatorModel.load("data/{}.model".format(name))
    except Exception:

        LR = LogisticRegression(labelCol="label",
                                featuresCol="features",
                                maxIter=10)
        if name[3] == 'P':
            LR.setThreshold(0.2)
        else:
            LR.setThreshold(0.25)

        evaluator = BinaryClassificationEvaluator()
        paramGrid = ParamGridBuilder().addGrid(LR.regParam, [1.0]).build()
        crossval = CrossValidator(estimator=LR,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=5)
        # train, test = df.select('features', func.col(column).alias("label")).randomSplit([0.5, 0.5])
        print("Training '{}' classifier... Please wait".format(name))

        model = crossval.fit(df.select("*", func.col(column).alias("label")))
        model.save("data/{}.model".format(name))
    # df_test = model.transform(df)
    # df_test.filter(df_test.prediction == 1).show()
    return model
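# A minimal usage sketch (hypothetical: assumes `df` has a "features" column plus integer
# sentiment columns, and that the 4th character of `name` encodes the model type):
# posModel = regression(df, "poslabel", "posP")   # name[3] == 'P' -> threshold 0.2
# negModel = regression(df, "neglabel", "negN")   # otherwise      -> threshold 0.25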
Example #2
    def test_default_read_write(self):
        temp_path = tempfile.mkdtemp()

        lr = LogisticRegression()
        lr.setMaxIter(50)
        lr.setThreshold(0.75)
        writer = DefaultParamsWriter(lr)
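        # DefaultParamsWriter serialises the estimator's uid, user-set params and default
        # params as JSON metadata under the save path; DefaultParamsReader rebuilds the
        # instance from that metadata.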

        savePath = temp_path + "/lr"
        writer.save(savePath)

        reader = DefaultParamsReadable.read()
        lr2 = reader.load(savePath)

        self.assertEqual(lr.uid, lr2.uid)
        self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

        # test overwrite
        lr.setThreshold(0.8)
        writer.overwrite().save(savePath)

        reader = DefaultParamsReadable.read()
        lr3 = reader.load(savePath)

        self.assertEqual(lr.uid, lr3.uid)
        self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
Example #3
    def test_default_read_write(self):
        temp_path = tempfile.mkdtemp()

        lr = LogisticRegression()
        lr.setMaxIter(50)
        lr.setThreshold(.75)
        writer = DefaultParamsWriter(lr)

        savePath = temp_path + "/lr"
        writer.save(savePath)

        reader = DefaultParamsReadable.read()
        lr2 = reader.load(savePath)

        self.assertEqual(lr.uid, lr2.uid)
        self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

        # test overwrite
        lr.setThreshold(.8)
        writer.overwrite().save(savePath)

        reader = DefaultParamsReadable.read()
        lr3 = reader.load(savePath)

        self.assertEqual(lr.uid, lr3.uid)
        self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
Example #4
    def test_default_read_write_default_params(self):
        lr = LogisticRegression()
        self.assertFalse(lr.isSet(lr.getParam("threshold")))

        lr.setMaxIter(50)
        lr.setThreshold(0.75)

        # `threshold` is set by user, default param `predictionCol` is not set by user.
        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        writer = DefaultParamsWriter(lr)
        metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
        self.assertTrue("defaultParamMap" in metadata)

        reader = DefaultParamsReadable.read()
        metadataStr = json.dumps(metadata, separators=[",", ":"])
        loadedMetadata = reader._parseMetaData(
            metadataStr,
        )
        reader.getAndSetParams(lr, loadedMetadata)

        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        # manually create metadata without `defaultParamMap` section.
        del metadata["defaultParamMap"]
        metadataStr = json.dumps(metadata, separators=[",", ":"])
        loadedMetadata = reader._parseMetaData(
            metadataStr,
        )
        with self.assertRaisesRegex(AssertionError, "`defaultParamMap` section not found"):
            reader.getAndSetParams(lr, loadedMetadata)

        # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
        metadata["sparkVersion"] = "2.3.0"
        metadataStr = json.dumps(metadata, separators=[",", ":"])
        loadedMetadata = reader._parseMetaData(
            metadataStr,
        )
        reader.getAndSetParams(lr, loadedMetadata)
Example #5
    def test_default_read_write_default_params(self):
        lr = LogisticRegression()
        self.assertFalse(lr.isSet(lr.getParam("threshold")))

        lr.setMaxIter(50)
        lr.setThreshold(.75)

        # `threshold` is set by user, default param `predictionCol` is not set by user.
        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        writer = DefaultParamsWriter(lr)
        metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
        self.assertTrue("defaultParamMap" in metadata)

        reader = DefaultParamsReadable.read()
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)

        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        # manually create metadata without `defaultParamMap` section.
        del metadata['defaultParamMap']
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        with self.assertRaisesRegex(AssertionError, "`defaultParamMap` section not found"):
            reader.getAndSetParams(lr, loadedMetadata)

        # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
        metadata['sparkVersion'] = '2.3.0'
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)
Example #6
# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ####Logistic Regression - Train

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# set threshold for the probability above which to predict a 1
lr.setThreshold(training_data_positive_rate)
# lr.setThreshold(0.5)  # could use this if you knew you had balanced data

# Train model with Training Data
lrModel = lr.fit(trainingData)

# get training summary used for eval metrics and other params
lrTrainingSummary = lrModel.summary

# Find the best model threshold if you would like to use that instead of the empirical positive rate
fMeasure = lrTrainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()
lrBestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
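# A follow-up sketch (assumes the variables above): to apply the F-measure-maximising
# threshold instead of the empirical positive rate, set it on the estimator and refit:
# lr.setThreshold(lrBestThreshold)
# lrModel = lr.fit(trainingData)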
Example #7
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)
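# Note: setThreshold here updates the estimator `lr`; the already-fitted model keeps the
# threshold it was trained with, so refit (or, in newer Spark versions, call setThreshold
# on the model itself) if you want predictions at the new threshold.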

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10,
                         regParam=0.3,
                         elasticNetParam=0.8,
                         family="multinomial")

# Fit the model
mlrModel = mlr.fit(left_join)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

# Index labels, adding metadata to the label column.
Example #8
def main(context):
    """Main function takes a Spark SQL context."""

    # TASK 1
    # Code for task 1...
    comments_DF = None
    submissions_DF = None
    labeled_data_DF = None

    comments_parquet = os.path.abspath("./comments-minimal.parquet")
    submissions_parquet = os.path.abspath("./submissions.parquet")
    labeled_data_parquet = os.path.abspath("./labeled_data.parquet")

    if (os.path.exists(labeled_data_parquet)):
        labeled_data_DF = context.read.parquet(labeled_data_parquet)
    else:
        labeled_data_DF = context.read.csv("labeled_data.csv", header=True)
        labeled_data_DF.write.parquet(labeled_data_parquet)

    if (os.path.exists(submissions_parquet)):
        submissions_DF = context.read.parquet(submissions_parquet)
    else:
        submissions_DF = context.read.json("submissions.json.bz2")
        submissions_DF.write.parquet(submissions_parquet)

    if (os.path.exists(comments_parquet)):
        comments_DF = context.read.parquet(comments_parquet)
    else:
        comments_DF = context.read.json("comments-minimal.json.bz2")
        comments_DF.write.parquet(comments_parquet)

    # TASK 2
    # Code for task 2...

    labeled_data_DF.createOrReplaceTempView("labeled_data")
    comments_DF.createOrReplaceTempView("comments")
    labeled_comments = context.sql(
        "select comments.id, cast(labeled_data.labeldjt as int) as label, body, author, author_flair_text, link_id, score, created_utc from labeled_data inner join comments on comments.id = labeled_data.Input_id"
    )
    #labeled_comments.select("id", "Input_id").show()
    #labeled_comments.show()

    # TASK 4, 5
    # Code for tasks 4 and 5
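    # The "sanitize" UDF below assumes cleantext.sanitize returns the processed text
    # followed by unigram/bigram/trigram strings; [1:] drops the processed text and the
    # reduce() splits and concatenates the remaining grams into one flat token list.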
    context.udf.register(
        "sanitize", lambda body: reduce(lambda acc, elem: acc + elem.split(),
                                        sanitize(body)[1:], []),
        ArrayType(StringType()))
    labeled_comments.createOrReplaceTempView("labeled_comments")
    combined = context.sql(
        "select *, sanitize(body) as words from labeled_comments")

    #combined.printSchema()
    #combined.select("body", "words").show()

    # TASK 6A
    # Code for task 6A...
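    # CountVectorizer settings: binary=True emits 0/1 presence features rather than
    # counts, minDF=5.0 drops tokens appearing in fewer than 5 comments, and
    # vocabSize=1 << 18 caps the vocabulary at 2^18 terms.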
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         minDF=5.0,
                         binary=True,
                         vocabSize=1 << 18)

    vectorize_model = cv.fit(combined)
    vectorized = vectorize_model.transform(combined)
    vectorize_model.write().overwrite().save("www/vector.model")

    # TASK 6B
    # Code for task 6B...
    vectorized.createOrReplaceTempView("vectorized")
    labeled = context.sql(
        "select *, case when label = 1 then 1 else 0 end as poslabel, case when label = -1 then 1 else 0 end as neglabel from vectorized"
    )
    #labeled.show()

    # TASK 7
    # Code for task 7...
    pos = labeled
    neg = labeled

    # Bunch of imports (may need more)
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    posmodel_path = os.path.abspath("www/pos.model")
    negmodel_path = os.path.abspath("www/neg.model")

    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="poslabel",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel",
                               featuresCol="features",
                               maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)

    #we set threshold here to avoid doing extra sql queries at the end

    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])

    posModel = None
    negModel = None

    # Train the models
    if (os.path.exists(posmodel_path)):
        posModel = CrossValidatorModel.load(posmodel_path)
    else:
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.write().overwrite().save(posmodel_path)

    if (os.path.exists(negmodel_path)):
        negModel = CrossValidatorModel.load(negmodel_path)
    else:
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        negModel.write().overwrite().save(negmodel_path)

    # TEST MODEL
    posResult = posModel.transform(posTest)
    posResult.createOrReplaceTempView("posResult")
    posAccuracy = context.sql(
        "select avg(case when poslabel = prediction then 1 else 0 end) as accuracy from posResult"
    )
    #posAccuracy.show()

    negResult = negModel.transform(negTest)
    negResult.createOrReplaceTempView("negResult")
    negAccuracy = context.sql(
        "select avg(case when neglabel = prediction then 1 else 0 end) as accuracy from negResult"
    )
    #negAccuracy.show()

    # PLOT ROC CURVE
    from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric

    results = posResult.select(['probability', 'poslabel'])
    results_collect = results.collect()
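    # For each row, probability[0] is P(label = 0); pairing it with the flipped label
    # (1.0 - poslabel) keeps the score and label oriented the same way for the ROC curve.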
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    from sklearn.metrics import roc_curve, auc
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr,
             tpr,
             'g--',
             label='Trump Positive Sentiment, ROC curve (area = %0.2f)' %
             roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    results = negResult.select(['probability', 'neglabel'])
    results_collect = results.collect()
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr,
             tpr,
             'r--',
             label='Trump Negative Sentiment, ROC curve (area = %0.2f)' %
             roc_auc)
    plt.legend(loc="lower right")
    plt.savefig('trump_ROC.png')

    # TASK 8
    # Code for task 8...
    #get title of submission post
    submissions_DF.createOrReplaceTempView("submissions")
    comments_DF.createOrReplaceTempView("comments")
    whole_data = context.sql(
        "select s.id as submission_id, s.title, s.author_cakeday, s.created_utc, s.author_flair_text, s.over_18, c.controversiality, c.body as body, c.id as comment_id, c.score as comment_score, s.score as story_score from comments c inner join submissions s on s.id = SUBSTR(c.link_id, 4, LENGTH(c.link_id) - 3) where body not like '%/s' and body not like '&gt%'"
    )
    whole_data.show(20)
    sampled = whole_data.sample(False, 0.5, 42)
    #sampled.show(20)

    #whole_data.count()
    #sampled.count()

    # TASK 9
    # Code for task 9...
    context.udf.register(
        "sanitize", lambda body: reduce(lambda acc, elem: acc + elem.split(),
                                        sanitize(body)[1:], []),
        ArrayType(StringType()))
    sampled.createOrReplaceTempView("sampled")
    combined = context.sql("select *, sanitize(body) as words from sampled")

    combined.printSchema()
    combined = combined.select("sampled.comment_id", "sampled.submission_id",
                               "sampled.title", "sampled.created_utc",
                               "sampled.author_flair_text",
                               "sampled.author_cakeday", "sampled.over_18",
                               "sampled.controversiality", "sampled.body",
                               "words", "sampled.comment_score",
                               "sampled.story_score")
    #combined.show()

    vectorized = vectorize_model.transform(combined)
    vectorized.show()

    posResult = posModel.transform(vectorized)
    posResult = posResult.withColumnRenamed(
        'prediction', 'pos').drop("rawPrediction").drop("probability")
    result = negModel.transform(posResult)
    result = result.withColumnRenamed(
        'prediction', 'neg').drop("rawPrediction").drop("probability")

    temp = result
    temp = temp.drop("body", "words", "features")
    result = result.drop("body", "words", "features", "title")
    #result.show()

    # TASK 10
    # Code for task 10...
    result.createOrReplaceTempView("result")

    #number 1
    totalrows = result.count()
    PosPercentage = result.filter(result.pos == 1.0).count() * 100 / totalrows
    NegPercentage = result.filter(result.neg == 1.0).count() * 100 / totalrows

    print("Positive Percentage: {}%".format(PosPercentage))
    print("Negative Percentage: {}%".format(NegPercentage))

    #number 2
    #https://medium.com/@mrpowers/working-with-dates-and-times-in-spark-491a9747a1d2
    with_time = result.withColumn(
        "date",
        functions.from_unixtime(functions.col('created_utc')).cast(DateType()))
    with_time_pos = with_time.groupBy("date").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    with_time_neg = with_time.groupBy("date").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    time_data = with_time_pos.join(with_time_neg, ["date"])

    time_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("time_data.csv")

    #number 3
    states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', \
    'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', \
    'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', \
    'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', \
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

    statelist = context.createDataFrame(
        states, StringType()
    )  #create data frame so we can join the two tables together, eliminate any non-state author_flair_text

    #https://stackoverflow.com/questions/40421356/how-to-do-left-outer-join-in-spark-sql
    #https://docs.databricks.com/spark/latest/faq/join-two-dataframes-duplicated-column.html
    #we found that the attribute name for statelist dataframe was "value" by using printSchema()

    statelist = statelist.withColumnRenamed("value", "state")
    result = result.withColumnRenamed("author_flair_text", "state")
    pos_states = result.groupBy("state").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    new_pos_states = pos_states.join(statelist, ["state"], "inner")
    new_pos_states = new_pos_states.withColumnRenamed(
        "(sum(pos) / count(pos))", "Positive")
    #new_pos_states = new_pos_states.withColumnRenamed("author_flair_text", "state")
    neg_states = result.groupBy("state").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    new_neg_states = neg_states.join(statelist, ["state"], "inner")
    new_neg_states = new_neg_states.withColumnRenamed(
        "(sum(neg) / count(neg))", "Negative")
    #new_neg_states = new_neg_states.withColumnRenamed("author_flair_text", "state")
    #tried doing left_outer initially, but not all author flair text is a state, so need to do inner instead

    state_data = new_pos_states.join(new_neg_states, ["state"], "inner")

    state_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("state_data.csv")

    #final deliverable number 4
    commentPos = result.groupBy("comment_score").agg(
        functions.sum(result.pos) / functions.count(
            result.pos))  #for some reason scalar values don't work???
    storyPos = result.groupBy("story_score").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    commentNeg = result.groupBy("comment_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    storyNeg = result.groupBy("story_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))

    comment_data = commentPos.join(commentNeg, ["comment_score"])
    submission_data = storyPos.join(storyNeg, ["story_score"])

    comment_data.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("comment_data.csv")
    submission_data.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("submission_data.csv")

    #http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html
    #Final Deliverable part 4
    temp.createOrReplaceTempView("temp")
    top_pos = context.sql(
        "select title, (sum(pos) / count(pos)) as Positive from temp group by title order by Positive desc limit 10"
    )
    top_neg = context.sql(
        "select title, (sum(neg) / count(neg)) as Negative from temp group by title order by Negative desc limit 10"
    )
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # $example on$
    # Extract the summary from the returned LogisticRegressionModel instance trained
    # in the earlier example
    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)

    # Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
    trainingSummary.roc.show()
    print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

    # Set the model threshold to maximize F-Measure
    fMeasure = trainingSummary.fMeasureByThreshold
    maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
    bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
        .select('threshold').head()['threshold']
    lr.setThreshold(bestThreshold)
    # $example off$

    spark.stop()
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()
bestThreshold = fMeasure.where(
    fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']).select(
        'threshold').head()['threshold']
logistic.setThreshold(bestThreshold)
print('best threshold is:' + str(bestThreshold))

print("For Logistic regression:")
trained_model = logistic.fit(train)
res = trained_model.transform(test)
metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd)
print('Accuracy on test set: ', evaluator.evaluate(res))
print('Area under ROC curve: ', eval.evaluate(res))
# find_performance_metrics(res, "logistic regression")
find_performance_metrics(res, "logistic regression with best threshold")

df = pd.DataFrame({
    'lr_coeff': trained_model.coefficients,
    'feature_column': feature_columns,
})
def main(context):
    """Main function takes a Spark SQL context."""
    # TASK 1
    # the read is from the parquet file
    comments = context.read.parquet("comments-minimal.parquet")
    submissions = context.read.parquet("submissions.parquet")

    # only look at columns that are useful
    comments = comments.select("id","created_utc","body","author_flair_text", "link_id", "score").\
        withColumnRenamed("score", "commentscore")
    submissions = submissions.select("id", "title", "score").\
        withColumnRenamed("score", "storyscore")

    #comments.write.parquet("comments-minimal.parquet")
    #submissions.write.parquet("submissions.parquet")

    # TASK 2
    labeled_data = context.read.format("csv").options(
        header='true', inferSchema='true').load('labeled_data.csv')

    #here we do the join on comment id
    joined = comments.join(labeled_data, comments.id == labeled_data.Input_id)
    #comments.join(labeled_data, comments.id == labeled_data.Input_id).explain()

    # TASK 4
    #sanitize_new ignores processed string given by sanitize
    from cleantext import sanitize

    def sanitize_new(text):
        r = sanitize(text)[1:]
        return r[0].split(" ") + r[1].split(" ") + r[2].split(" ")

    # TASK 5
    #create the udf, generate new column of n-grams
    sanitize_udf = udf(sanitize_new, ArrayType(StringType()))
    joined = joined.withColumn("ngrams", sanitize_udf(joined.body))

    # TASK 6A
    # construct feature vector based on "ngrams"
    #store the transformed column in "features"
    #CountVectorizer produces a sparse vector by default so no need to change
    cv = CountVectorizer(inputCol="ngrams",
                         outputCol="features",
                         minDF=5.0,
                         binary=True)
    cv_model = cv.fit(joined)
    joined = cv_model.transform(joined)

    # TASK 6B
    # construct pos column and neg column
    #for this project, only look at label on Trump
    pos_udf = udf(lambda label: 1 if label == 1 else 0, IntegerType())
    neg_udf = udf(lambda label: 1 if label == -1 else 0, IntegerType())
    joined = joined.withColumn("poslabel", pos_udf(joined.labeldjt))
    joined = joined.withColumn("neglabel", neg_udf(joined.labeldjt))

    # TASK 7
    #train logistic regression model
    #code adapted from project spec
    #Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="poslabel",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel",
                               featuresCol="features",
                               maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
    negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = joined.randomSplit([0.5, 0.5])
    negTrain, negTest = joined.randomSplit([0.5, 0.5])

    # Train the models
    posModel = posCrossval.fit(posTrain)
    negModel = negCrossval.fit(negTrain)

    # TASK: Extra Credit's Curve:
    # evaluate the model
    #     posTestRes = posModel.transform(posTest).toPandas()['probability']
    #     posTestRes = np.array([i[1] for i in posTestRes])
    #     negTestRes = negModel.transform(negTest).toPandas()['probability']
    #     negTestRes = np.array([i[1] for i in negTestRes])
    #     print(negTestRes, posTestRes)
    #     print('ok')
    #     pfpr, ptpr, _ = metrics.roc_curve(posTest.select('poslabel').toPandas(), posTestRes)
    #     nfpr, ntpr, _ = metrics.roc_curve(negTest.select('neglabel').toPandas(), negTestRes)
    #     print(pfpr[:5], ptpr[:5], nfpr[:5],ntpr[:5])
    #     plt.plot(pfpr, ptpr, label = 'posModel')
    #     plt.plot(nfpr, ntpr, label = 'negModel')
    #     plt.legend()
    #     plt.savefig('ROC.png')
    #     plt.close()
    #     # save the models
    #     posModel.save("www/pos.model")
    #     negModel.save("www/neg.model")

    #load instead
    #     posModel = CrossValidatorModel.load("www/pos.model")
    #     negModel = CrossValidatorModel.load("www/neg.model")
    #print("finished loading model")

    # TASK 8.1
    # selected column 'created_utc' and transformed in 10.2 using from_unixtime

    # TASK 8.2
    # title of submission of the comment
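    # Comment link_id values are prefixed with "t3_" (the submission fullname prefix);
    # stripping it leaves the bare submission id so we can join on submissions.id.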
    comments = comments.withColumn("clean_id",
                                   regexp_replace("link_id", r'^t3_', ''))
    comments = comments.join(
        submissions, comments.clean_id == submissions.id).drop(submissions.id)

    # TASK 8.3
    # Please see TASK 10.3 (by state) line 166

    # TASK 9
    #filter out comments that contain "\s" or start with "&gt"
    comments = comments.filter(~comments.body.rlike(r'^&gt')).\
        filter(~comments.body.rlike(r'\\s'))
    #sample
    comments = comments.sample(
        False, sampleRate,
        1)  # fixed seed so the sample is reproducible
    #redo 4,5,6a
    comments = comments.withColumn("ngrams", sanitize_udf(comments.body))
    comments = cv_model.transform(comments)
    #print("done with transforming the sampled comments")

    #make predictions
    comments = posModel.transform(comments).\
        drop("body", "link_id", "clean_id", "ngrams","rawPrediction", "probability").\
        withColumnRenamed("prediction", "poslabel")
    comments = negModel.transform(comments).drop("features", "rawPrediction", "probability").\
        withColumnRenamed("prediction", "neglabel")

    # TASK 10.1
    # compute the percentage of positive, negative comments
    #print("Percentage of positive comments")
    result = comments.select('poslabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("pos-perc.csv")
    #print("Percenetage of negative comments")
    result = comments.select('neglabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("neg-perc.csv")

    # TASK 10.2
    #2. by date
    comments = comments.withColumn(
        "date", from_unixtime(comments.created_utc, "yyyy-MM-dd"))
    result = comments.groupBy("date").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("time_data.csv")

    # TASK 10.3
    #3. by state
    val_state_udf = udf(lambda state: state if state in states else None,
                        StringType())
    comments = comments.withColumn(
        "state", val_state_udf(lower(comments.author_flair_text)))
    comments = comments.filter(comments.state.isNotNull())
    result = comments.groupBy("state").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.show(truncate=False)
    #print(result.count())
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("state_data.csv")

    # TASK 10.4
    #4a. by comment score
    result = comments.groupBy("commentscore").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("comment_score.csv")
    #4b. by story score
    result = comments.groupBy("storyscore").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("story_score.csv")

    # DELIVERABLE 4.
    story = result.orderBy('avg(poslabel)', ascending=False).limit(10)
    # join is too expensive, subquery is also expensive
    score_list = set(story.select('storyscore').toPandas()['storyscore'])
    comments[comments.storyscore.isin(score_list)].select(
        'storyscore', 'title').limit(20).show(truncate=False)

    story = result.orderBy('avg(neglabel)', ascending=False).limit(10)
    score_list = set(story.select('storyscore').toPandas()['storyscore'])
    comments[comments.storyscore.isin(score_list)].select(
        'storyscore', 'title').limit(20).show(truncate=False)
Example #12
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()

        label = ''
        for y in label_colm:
            label = y

        f = ""
        f = label + " ~ "

        for x in feature_colm:
            f = f + x + "+"
        f = f[:-1]
        f = (f)

        formula = RFormula(formula=f, featuresCol="features", labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        finalized_data = output.select("features", "label")

        finalized_data.show()

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        Accuracy_list = []

        FPR_list = []
        TPR_list = []
        precision_list = []
        recall_list = []
        lr = LogisticRegression(maxIter=5)
        lrModel = lr.fit(train_data)

        print("coefficients:" + str(lrModel.coefficientMatrix))
        print("intercept: " + str(lrModel.interceptVector))
        training_summary = lrModel.summary
        BinaryLogisticRegressionTrainingSummary.accuracy
        print(" area under roc : ", training_summary.areaUnderROC)
        print("  roc : ", training_summary.roc)
        roc = training_summary.roc
        roc.show()
        roc.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/ROC_plot.parquet',
            mode='overwrite')
        print(" pr value : ", training_summary.pr)
        pr = training_summary.pr
        pr.show()
        pr.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/PR_plot.parquet',
            mode='overwrite')
        print(" precision by threshold : ",
              training_summary.precisionByThreshold)
        prec_by_threshold = training_summary.precisionByThreshold
        prec_by_threshold.show()
        print(" accuracy : ", training_summary.accuracy)
        accuracy_d = training_summary.accuracy
        print(accuracy_d)
        fMeasure = training_summary.fMeasureByThreshold
        fMeasure.show()
        maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
            'max(F-Measure)').head()
        bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
            .select('threshold').head()['threshold']
        lr.setThreshold(bestThreshold)
        objectiveHistory = training_summary.objectiveHistory
        print("objectiveHistory")
        for objective in objectiveHistory:
            print(objective)
        print("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))
        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print("label %d : %s" % (i, rate))
        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))
        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))
        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))
        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall
        print(
            "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
            % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
               precision, recall))
        Accuracy_list.append(accuracy)
        FPR_list.append(falsePositiveRate)
        TPR_list.append(truePositiveRate)
        precision_list.append(precision)
        recall_list.append(recall)
        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)
        fpr = roc.select("FPR").toPandas()
        tpr = roc.select("TPR").toPandas()
        plt.plot(fpr, tpr)
        plt.show()
        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()
        plt.plot(pr_precision, pr_recall)
        plt.show()
        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy("label", "prediction").count().show()
        prediction_val.show()
        prediction_val.groupBy("prediction").count().show()
        prediction_val.groupBy("prediction", "probability").count().show()
Example #13
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()
        label = ''
        for y in label_colm:
            label = y

        print(label)

        # using the rformula for indexing, encoding and vectorising

        # f = ""
        # f = label + " ~ "
        #
        # for x in features:
        #     f = f + x + "+"
        # f = f[:-1]
        # f = (f)

        # extracting the schema

        val = dataset.schema

        string_features = []
        integer_features = []

        for x in val:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        string_features.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        integer_features.append(x.name)

        print(string_features)
        print(integer_features)
        print(val)
        # print(label)
        # label = 'y'

        for z in val:
            if (z.name == label and str(z.dataType) == "StringType"):
                label_indexer = StringIndexer(inputCol=label,
                                              outputCol='indexed_' +
                                              label).fit(dataset)
                dataset = label_indexer.transform(dataset)
            if (z.name == label and
                    str(z.dataType) in ("IntegerType", "FloatType", "DoubleType")):
                dataset = dataset.withColumnRenamed(label, 'indexed_' + label)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for col in string_features:
            indexer = StringIndexer(inputCol=col,
                                    outputCol='indexed_' + col).fit(dataset)
            indexed_features.append('indexed_' + col)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+col], outputCols=['encoded_'+col]).fit(dataset)
            # encoded_features.append('encoded_'+col)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both the features colm together

        final_features = integer_features + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()

        # combining both the features colm together

        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        # using the vector indexer (for categorical data kind of one hot encoding)

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=15).fit(dataset)
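        # maxCategories=15: features with at most 15 distinct values are re-encoded as
        # category indices; features with more distinct values are left as continuous.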

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select('indexed_' + label,
                                            'vec_indexed_features')
        finalized_data.show()

        # formula = RFormula(formula=f,
        #                    featuresCol="features",
        #                    labelCol="label")
        #
        # output = formula.fit(dataset).transform(dataset)
        #
        # output_2 = output.select("features", "label")
        #
        # output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        # implementing the logistic regression
        # lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y = 0.1
        # x=[]
        for i in range(0, 3):
            y = round(y + 0.1, 2)

            lr = LogisticRegression(featuresCol='vec_indexed_features',
                                    labelCol='indexed_' + label,
                                    maxIter=5,
                                    regParam=0.1,
                                    elasticNetParam=1.0,
                                    threshold=0.3)

            # fit the model

            lrModel = lr.fit(train_data)
            lrModel

            # print the coefficients and the intercept for the logistic regression

            print("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))

            # getting the summary of the model

            # f-measure calculation
            from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary

            training_summary = lrModel.summary

            BinaryLogisticRegressionTrainingSummary.accuracy

            print(" area under roc : ", training_summary.areaUnderROC)
            print("  roc : ", training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print(" pr value : ", training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print(" precision by threshold : ",
                  training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print(" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print(accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
                'max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print("objectiveHistory")
            for objective in objectiveHistory:
                print(objective)

            # for a multiclass we can inspect  a matrix on a per label basis

            print("false positive rate by label:")
            for i, rate in enumerate(
                    training_summary.falsePositiveRateByLabel):
                print("label %d: %s" % (i, rate))

            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print(
                "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
                   precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()

        plt.plot(fpr, tpr)
        plt.show()

        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_precision, pr_recall)
        plt.show()

        # now applying the fit on the test data

        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy('indexed_' + label, "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #14
    def Logistic_regression(dataset_add, features, label):

        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()

        # using the rformula for indexing, encoding and vectorising

        f = ""
        f = label + " ~ "

        for x in features:
            f = f + x + "+"
        f = f[:-1]
        f = (f)

        formula = RFormula(formula=f,
                           featuresCol="features",
                           labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        output_2 = output.select("features", "label")

        output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = output_2.randomSplit([0.75, 0.25], seed = 40)

        # implementing the logistic regression
        lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y= 0.1
        # x=[]
        for i in range(0,3):
            y=round(y+0.1,2)

            lr = LogisticRegression(maxIter=5, regParam=0.1, elasticNetParam=1.0, threshold=0.3)



            # fit the model


            lrModel = lr.fit(train_data)
            lrModel

            # print the coefficients and the intercept for the logistic regression

            print ("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))





            # getting the summary of the model

            # f-measure calculation
            from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary

            training_summary = lrModel.summary

            BinaryLogisticRegressionTrainingSummary.accuracy

            print (" area under roc : " , training_summary.areaUnderROC)
            print ("  roc : " , training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print (" pr value : " , training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print (" precision by threshold : " , training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print (" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print (accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print ("objectiveHistory")
            for objective in objectiveHistory:
                print (objective)


            # for a multiclass we can inspect  a matrix on a per label basis

            print ("false positive rate by label:")
            for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
                print ("label %d: %s" % (i, rate))


            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print ("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                  % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print (Accuracy_list)
        print (FPR_list)
        print (TPR_list)
        print (precision_list)
        print (recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()


        plt.plot(fpr, tpr)
        plt.show()


        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_precision,pr_recall)
        plt.show()


        # now applying the fit on the test data


        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy("label", "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #15
# set best threshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()['max(F-Measure)']
print "maxFMeasure"
print maxFMeasure
fMeasure_new = spark.createDataFrame(fMeasure.rdd, ['threshold', 'fmeasure'])
bestThreshold = fMeasure_new.where(
    maxFMeasure - fMeasure_new.fmeasure < 0.00001).select('threshold').head()
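# The max F-measure row is matched with a small tolerance rather than exact float equality.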
print "bestThreshold"
print bestThreshold
print "original threshold"
print lr.getThreshold()

#lr.setThreshold(bestThreshold['threshold'])
lr.setThreshold(-3)
print "new threshold"
print lr.getThreshold()

#lr.setThreshold(bestThreshold)
# $example off$
#test
result_all = lrModel.transform(training)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
res = evaluator.evaluate(result_all)
print "evaluator res:"
print res
res = evaluator.evaluate(result_all, {evaluator.metricName: "areaUnderPR"})
print "evaluator pr res:"
print res