Example #1
def regression(df, column, name):
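    # Summary: load a previously saved CrossValidatorModel from data/<name>.model if it
    # exists; otherwise train a 5-fold cross-validated LogisticRegression on `df`
    # (with `column` aliased to "label") and persist the fitted model for later runs.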
    try:
        model = CrossValidatorModel.load("data/{}.model".format(name))
    except Exception:

        LR = LogisticRegression(labelCol="label",
                                featuresCol="features",
                                maxIter=10)
        if name[3] == 'P':
            LR.setThreshold(0.2)
        else:
            LR.setThreshold(0.25)

        evaluator = BinaryClassificationEvaluator()
        paramGrid = ParamGridBuilder().addGrid(LR.regParam, [1.0]).build()
        crossval = CrossValidator(estimator=LR,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=5)
        # train, test = df.select('features', func.col(column).alias("label")).randomSplit([0.5, 0.5])
        print("Training '{}' classifier... Please wait".format(name))

        model = crossval.fit(df.select("*", func.col(column).alias("label")))
        model.save("data/{}.model".format(name))
    # df_test = model.transform(df)
    # df_test.filter(df_test.prediction == 1).show()
    return model
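# A minimal usage sketch (hypothetical: assumes `df` has a "features" column plus integer
# sentiment columns, and that the 4th character of `name` encodes the model type):
# posModel = regression(df, "poslabel", "posP")   # name[3] == 'P' -> threshold 0.2
# negModel = regression(df, "neglabel", "negN")   # otherwise      -> threshold 0.25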
Example #2
    def test_default_read_write(self):
        temp_path = tempfile.mkdtemp()

        lr = LogisticRegression()
        lr.setMaxIter(50)
        lr.setThreshold(0.75)
        writer = DefaultParamsWriter(lr)
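        # DefaultParamsWriter serialises the estimator's uid, user-set params and default
        # params as JSON metadata under the save path; DefaultParamsReader rebuilds the
        # instance from that metadata.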

        savePath = temp_path + "/lr"
        writer.save(savePath)

        reader = DefaultParamsReadable.read()
        lr2 = reader.load(savePath)

        self.assertEqual(lr.uid, lr2.uid)
        self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

        # test overwrite
        lr.setThreshold(0.8)
        writer.overwrite().save(savePath)

        reader = DefaultParamsReadable.read()
        lr3 = reader.load(savePath)

        self.assertEqual(lr.uid, lr3.uid)
        self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
Example #3
    def test_default_read_write(self):
        temp_path = tempfile.mkdtemp()

        lr = LogisticRegression()
        lr.setMaxIter(50)
        lr.setThreshold(.75)
        writer = DefaultParamsWriter(lr)

        savePath = temp_path + "/lr"
        writer.save(savePath)

        reader = DefaultParamsReadable.read()
        lr2 = reader.load(savePath)

        self.assertEqual(lr.uid, lr2.uid)
        self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

        # test overwrite
        lr.setThreshold(.8)
        writer.overwrite().save(savePath)

        reader = DefaultParamsReadable.read()
        lr3 = reader.load(savePath)

        self.assertEqual(lr.uid, lr3.uid)
        self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
Example #4
    def test_default_read_write_default_params(self):
        lr = LogisticRegression()
        self.assertFalse(lr.isSet(lr.getParam("threshold")))

        lr.setMaxIter(50)
        lr.setThreshold(0.75)

        # `threshold` is set by user, default param `predictionCol` is not set by user.
        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        writer = DefaultParamsWriter(lr)
        metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
        self.assertTrue("defaultParamMap" in metadata)

        reader = DefaultParamsReadable.read()
        metadataStr = json.dumps(metadata, separators=[",", ":"])
        loadedMetadata = reader._parseMetaData(
            metadataStr,
        )
        reader.getAndSetParams(lr, loadedMetadata)

        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        # manually create metadata without `defaultParamMap` section.
        del metadata["defaultParamMap"]
        metadataStr = json.dumps(metadata, separators=[",", ":"])
        loadedMetadata = reader._parseMetaData(
            metadataStr,
        )
        with self.assertRaisesRegex(AssertionError, "`defaultParamMap` section not found"):
            reader.getAndSetParams(lr, loadedMetadata)

        # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
        metadata["sparkVersion"] = "2.3.0"
        metadataStr = json.dumps(metadata, separators=[",", ":"])
        loadedMetadata = reader._parseMetaData(
            metadataStr,
        )
        reader.getAndSetParams(lr, loadedMetadata)
Example #5
    def test_default_read_write_default_params(self):
        lr = LogisticRegression()
        self.assertFalse(lr.isSet(lr.getParam("threshold")))

        lr.setMaxIter(50)
        lr.setThreshold(.75)

        # `threshold` is set by user, default param `predictionCol` is not set by user.
        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        writer = DefaultParamsWriter(lr)
        metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
        self.assertTrue("defaultParamMap" in metadata)

        reader = DefaultParamsReadable.read()
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)

        self.assertTrue(lr.isSet(lr.getParam("threshold")))
        self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
        self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

        # manually create metadata without `defaultParamMap` section.
        del metadata['defaultParamMap']
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        with self.assertRaisesRegex(AssertionError, "`defaultParamMap` section not found"):
            reader.getAndSetParams(lr, loadedMetadata)

        # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
        metadata['sparkVersion'] = '2.3.0'
        metadataStr = json.dumps(metadata, separators=[',', ':'])
        loadedMetadata = reader._parseMetaData(metadataStr)
        reader.getAndSetParams(lr, loadedMetadata)
Example #6
# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ####Logistic Regression - Train

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# set threshold for the probability above which to predict a 1
lr.setThreshold(training_data_positive_rate)
# lr.setThreshold(0.5)  # could use this if you knew you had balanced data

# Train model with Training Data
lrModel = lr.fit(trainingData)

# get training summary used for eval metrics and other params
lrTrainingSummary = lrModel.summary

# Find the best model threshold if you would like to use that instead of the empirical positive rate
fMeasure = lrTrainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()
lrBestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
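# A follow-up sketch (assumes the variables above): to apply the F-measure-maximising
# threshold instead of the empirical positive rate, set it on the estimator and refit:
# lr.setThreshold(lrBestThreshold)
# lrModel = lr.fit(trainingData)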
Example #7
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)
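# Note: setThreshold here updates the estimator `lr`; the already-fitted model keeps the
# threshold it was trained with, so refit (or, in newer Spark versions, call setThreshold
# on the model itself) if you want predictions at the new threshold.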

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10,
                         regParam=0.3,
                         elasticNetParam=0.8,
                         family="multinomial")

# Fit the model
mlrModel = mlr.fit(left_join)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

# Index labels, adding metadata to the label column.
Example #8
def main(context):
    """Main function takes a Spark SQL context."""

    # TASK 1
    # Code for task 1...
    comments_DF = None
    submissions_DF = None
    labeled_data_DF = None

    comments_parquet = os.path.abspath("./comments-minimal.parquet")
    submissions_parquet = os.path.abspath("./submissions.parquet")
    labeled_data_parquet = os.path.abspath("./labeled_data.parquet")

    if (os.path.exists(labeled_data_parquet)):
        labeled_data_DF = context.read.parquet(labeled_data_parquet)
    else:
        labeled_data_DF = context.read.csv("labeled_data.csv", header=True)
        labeled_data_DF.write.parquet(labeled_data_parquet)

    if (os.path.exists(submissions_parquet)):
        submissions_DF = context.read.parquet(submissions_parquet)
    else:
        submissions_DF = context.read.json("submissions.json.bz2")
        submissions_DF.write.parquet(submissions_parquet)

    if (os.path.exists(comments_parquet)):
        comments_DF = context.read.parquet(comments_parquet)
    else:
        comments_DF = context.read.json("comments-minimal.json.bz2")
        comments_DF.write.parquet(comments_parquet)

    # TASK 2
    # Code for task 2...

    labeled_data_DF.createOrReplaceTempView("labeled_data")
    comments_DF.createOrReplaceTempView("comments")
    labeled_comments = context.sql(
        "select comments.id, cast(labeled_data.labeldjt as int) as label, body, author, author_flair_text, link_id, score, created_utc from labeled_data inner join comments on comments.id = labeled_data.Input_id"
    )
    #labeled_comments.select("id", "Input_id").show()
    #labeled_comments.show()

    # TASK 4, 5
    # Code for tasks 4 and 5
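    # The "sanitize" UDF below assumes cleantext.sanitize returns the processed text
    # followed by unigram/bigram/trigram strings; [1:] drops the processed text and the
    # reduce() splits and concatenates the remaining grams into one flat token list.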
    context.udf.register(
        "sanitize", lambda body: reduce(lambda acc, elem: acc + elem.split(),
                                        sanitize(body)[1:], []),
        ArrayType(StringType()))
    labeled_comments.createOrReplaceTempView("labeled_comments")
    combined = context.sql(
        "select *, sanitize(body) as words from labeled_comments")

    #combined.printSchema()
    #combined.select("body", "words").show()

    # TASK 6A
    # Code for task 6A...
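    # CountVectorizer settings: binary=True emits 0/1 presence features rather than
    # counts, minDF=5.0 drops tokens appearing in fewer than 5 comments, and
    # vocabSize=1 << 18 caps the vocabulary at 2^18 terms.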
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         minDF=5.0,
                         binary=True,
                         vocabSize=1 << 18)

    vectorize_model = cv.fit(combined)
    vectorized = vectorize_model.transform(combined)
    vectorize_model.write().overwrite().save("www/vector.model")

    # TASK 6B
    # Code for task 6B...
    vectorized.createOrReplaceTempView("vectorized")
    labeled = context.sql(
        "select *, case when label = 1 then 1 else 0 end as poslabel, case when label = -1 then 1 else 0 end as neglabel from vectorized"
    )
    #labeled.show()

    # TASK 7
    # Code for task 7...
    pos = labeled
    neg = labeled

    # Bunch of imports (may need more)
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    posmodel_path = os.path.abspath("www/pos.model")
    negmodel_path = os.path.abspath("www/neg.model")

    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="poslabel",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel",
                               featuresCol="features",
                               maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)

    #we set threshold here to avoid doing extra sql queries at the end

    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])

    posModel = None
    negModel = None

    # Train the models
    if (os.path.exists(posmodel_path)):
        posModel = CrossValidatorModel.load(posmodel_path)
    else:
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.write().overwrite().save(posmodel_path)

    if (os.path.exists(negmodel_path)):
        negModel = CrossValidatorModel.load(negmodel_path)
    else:
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        negModel.write().overwrite().save(negmodel_path)

    # TEST MODEL
    posResult = posModel.transform(posTest)
    posResult.createOrReplaceTempView("posResult")
    posAccuracy = context.sql(
        "select avg(case when poslabel = prediction then 1 else 0 end) as accuracy from posResult"
    )
    #posAccuracy.show()

    negResult = negModel.transform(negTest)
    negResult.createOrReplaceTempView("negResult")
    negAccuracy = context.sql(
        "select avg(case when neglabel = prediction then 1 else 0 end) as accuracy from negResult"
    )
    #negAccuracy.show()

    # PLOT ROC CURVE
    from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric

    results = posResult.select(['probability', 'poslabel'])
    results_collect = results.collect()
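    # For each row, probability[0] is P(label = 0); pairing it with the flipped label
    # (1.0 - poslabel) keeps the score and label oriented the same way for the ROC curve.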
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    from sklearn.metrics import roc_curve, auc
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr,
             tpr,
             'g--',
             label='Trump Positive Sentiment, ROC curve (area = %0.2f)' %
             roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    results = negResult.select(['probability', 'neglabel'])
    results_collect = results.collect()
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr,
             tpr,
             'r--',
             label='Trump Negative Sentiment, ROC curve (area = %0.2f)' %
             roc_auc)
    plt.legend(loc="lower right")
    plt.savefig('trump_ROC.png')

    # TASK 8
    # Code for task 8...
    #get title of submission post
    submissions_DF.createOrReplaceTempView("submissions")
    comments_DF.createOrReplaceTempView("comments")
    whole_data = context.sql(
        "select s.id as submission_id, s.title, s.author_cakeday, s.created_utc, s.author_flair_text, s.over_18, c.controversiality, c.body as body, c.id as comment_id, c.score as comment_score, s.score as story_score from comments c inner join submissions s on s.id = SUBSTR(c.link_id, 4, LENGTH(c.link_id) - 3) where body not like '%/s' and body not like '&gt%'"
    )
    whole_data.show(20)
    sampled = whole_data.sample(False, 0.5, 42)
    #sampled.show(20)

    #whole_data.count()
    #sampled.count()

    # TASK 9
    # Code for task 9...
    context.udf.register(
        "sanitize", lambda body: reduce(lambda acc, elem: acc + elem.split(),
                                        sanitize(body)[1:], []),
        ArrayType(StringType()))
    sampled.createOrReplaceTempView("sampled")
    combined = context.sql("select *, sanitize(body) as words from sampled")

    combined.printSchema()
    combined = combined.select("sampled.comment_id", "sampled.submission_id",
                               "sampled.title", "sampled.created_utc",
                               "sampled.author_flair_text",
                               "sampled.author_cakeday", "sampled.over_18",
                               "sampled.controversiality", "sampled.body",
                               "words", "sampled.comment_score",
                               "sampled.story_score")
    #combined.show()

    vectorized = vectorize_model.transform(combined)
    vectorized.show()

    posResult = posModel.transform(vectorized)
    posResult = posResult.withColumnRenamed(
        'prediction', 'pos').drop("rawPrediction").drop("probability")
    result = negModel.transform(posResult)
    result = result.withColumnRenamed(
        'prediction', 'neg').drop("rawPrediction").drop("probability")

    temp = result
    temp = temp.drop("body", "words", "features")
    result = result.drop("body", "words", "features", "title")
    #result.show()

    # TASK 10
    # Code for task 10...
    result.createOrReplaceTempView("result")

    #number 1
    totalrows = result.count()
    PosPercentage = result.filter(result.pos == 1.0).count() * 100 / totalrows
    NegPercentage = result.filter(result.neg == 1.0).count() * 100 / totalrows

    print("Positive Percentage: {}%".format(PosPercentage))
    print("Negative Percentage: {}%".format(NegPercentage))

    #number 2
    #https://medium.com/@mrpowers/working-with-dates-and-times-in-spark-491a9747a1d2
    with_time = result.withColumn(
        "date",
        functions.from_unixtime(functions.col('created_utc')).cast(DateType()))
    with_time_pos = with_time.groupBy("date").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    with_time_neg = with_time.groupBy("date").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    time_data = with_time_pos.join(with_time_neg, ["date"])

    time_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("time_data.csv")

    #number 3
    states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', \
    'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', \
    'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', \
    'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', \
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

    statelist = context.createDataFrame(
        states, StringType()
    )  #create data frame so we can join the two tables together, eliminate any non-state author_flair_text

    #https://stackoverflow.com/questions/40421356/how-to-do-left-outer-join-in-spark-sql
    #https://docs.databricks.com/spark/latest/faq/join-two-dataframes-duplicated-column.html
    #we found that the attribute name for statelist dataframe was "value" by using printSchema()

    statelist = statelist.withColumnRenamed("value", "state")
    result = result.withColumnRenamed("author_flair_text", "state")
    pos_states = result.groupBy("state").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    new_pos_states = pos_states.join(statelist, ["state"], "inner")
    new_pos_states = new_pos_states.withColumnRenamed(
        "(sum(pos) / count(pos))", "Positive")
    #new_pos_states = new_pos_states.withColumnRenamed("author_flair_text", "state")
    neg_states = result.groupBy("state").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    new_neg_states = neg_states.join(statelist, ["state"], "inner")
    new_neg_states = new_neg_states.withColumnRenamed(
        "(sum(neg) / count(neg))", "Negative")
    #new_neg_states = new_neg_states.withColumnRenamed("author_flair_text", "state")
    #tried doing left_outer initially, but not all author flair text is a state, so need to do inner instead

    state_data = new_pos_states.join(new_neg_states, ["state"], "inner")

    state_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("state_data.csv")

    #final deliverable number 4
    commentPos = result.groupBy("comment_score").agg(
        functions.sum(result.pos) / functions.count(
            result.pos))  #for some reason scalar values don't work???
    storyPos = result.groupBy("story_score").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    commentNeg = result.groupBy("comment_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    storyNeg = result.groupBy("story_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))

    comment_data = commentPos.join(commentNeg, ["comment_score"])
    submission_data = storyPos.join(storyNeg, ["story_score"])

    comment_data.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("comment_data.csv")
    submission_data.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("submission_data.csv")

    #http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html
    #Final Deliverable part 4
    temp.createOrReplaceTempView("temp")
    top_pos = context.sql(
        "select title, (sum(pos) / count(pos)) as Positive from temp group by title order by Positive desc limit 10"
    )
    top_neg = context.sql(
        "select title, (sum(neg) / count(neg)) as Negative from temp group by title order by Negative desc limit 10"
    )
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # $example on$
    # Extract the summary from the returned LogisticRegressionModel instance trained
    # in the earlier example
    trainingSummary = lrModel.summary

    # Obtain the objective per iteration
    objectiveHistory = trainingSummary.objectiveHistory
    print("objectiveHistory:")
    for objective in objectiveHistory:
        print(objective)

    # Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
    trainingSummary.roc.show()
    print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

    # Set the model threshold to maximize F-Measure
    fMeasure = trainingSummary.fMeasureByThreshold
    maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
    bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
        .select('threshold').head()['threshold']
    lr.setThreshold(bestThreshold)
    # $example off$

    spark.stop()
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()
bestThreshold = fMeasure.where(
    fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']).select(
        'threshold').head()['threshold']
logistic.setThreshold(bestThreshold)
print('best threshold is:' + str(bestThreshold))

print("For Logistic regression:")
trained_model = logistic.fit(train)
res = trained_model.transform(test)
metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd)
print('Accuracy on test set: ', evaluator.evaluate(res))
print('Area under ROC curve: ', eval.evaluate(res))
# find_performance_metrics(res, "logistic regression")
find_performance_metrics(res, "logistic regression with best threshold")

df = pd.DataFrame({
    'lr_coeff': trained_model.coefficients,
    'feature_column': feature_columns,
})
def main(context):
    """Main function takes a Spark SQL context."""
    # TASK 1
    # the read is from the parquet file
    comments = context.read.parquet("comments-minimal.parquet")
    submissions = context.read.parquet("submissions.parquet")

    # only look at columns that are useful
    comments = comments.select("id","created_utc","body","author_flair_text", "link_id", "score").\
        withColumnRenamed("score", "commentscore")
    submissions = submissions.select("id", "title", "score").\
        withColumnRenamed("score", "storyscore")

    #comments.write.parquet("comments-minimal.parquet")
    #submissions.write.parquet("submissions.parquet")

    # TASK 2
    labeled_data = context.read.format("csv").options(
        header='true', inferSchema='true').load('labeled_data.csv')

    #here we do the join on comment id
    joined = comments.join(labeled_data, comments.id == labeled_data.Input_id)
    #comments.join(labeled_data, comments.id == labeled_data.Input_id).explain()

    # TASK 4
    #sanitize_new ignores processed string given by sanitize
    from cleantext import sanitize

    def sanitize_new(text):
        r = sanitize(text)[1:]
        return r[0].split(" ") + r[1].split(" ") + r[2].split(" ")

    # TASK 5
    #create the udf, generate new column of n-grams
    sanitize_udf = udf(sanitize_new, ArrayType(StringType()))
    joined = joined.withColumn("ngrams", sanitize_udf(joined.body))

    # TASK 6A
    # construct feature vector based on "ngrams"
    #store the transformed column in "features"
    #CountVectorizer produces a sparse vector by default so no need to change
    cv = CountVectorizer(inputCol="ngrams",
                         outputCol="features",
                         minDF=5.0,
                         binary=True)
    cv_model = cv.fit(joined)
    joined = cv_model.transform(joined)

    # TASK 6B
    # construct pos column and neg column
    #for this project, only look at label on Trump
    pos_udf = udf(lambda label: 1 if label == 1 else 0, IntegerType())
    neg_udf = udf(lambda label: 1 if label == -1 else 0, IntegerType())
    joined = joined.withColumn("poslabel", pos_udf(joined.labeldjt))
    joined = joined.withColumn("neglabel", neg_udf(joined.labeldjt))

    # TASK 7
    #train logistic regression model
    #code adapted from project spec
    #Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="poslabel",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel",
                               featuresCol="features",
                               maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
    negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = joined.randomSplit([0.5, 0.5])
    negTrain, negTest = joined.randomSplit([0.5, 0.5])

    # Train the models
    posModel = posCrossval.fit(posTrain)
    negModel = negCrossval.fit(negTrain)

    # TASK: Extra Credit's Curve:
    # evaluate the model
    #     posTestRes = posModel.transform(posTest).toPandas()['probability']
    #     posTestRes = np.array([i[1] for i in posTestRes])
    #     negTestRes = negModel.transform(negTest).toPandas()['probability']
    #     negTestRes = np.array([i[1] for i in negTestRes])
    #     print(negTestRes, posTestRes)
    #     print('ok')
    #     pfpr, ptpr, _ = metrics.roc_curve(posTest.select('poslabel').toPandas(), posTestRes)
    #     nfpr, ntpr, _ = metrics.roc_curve(negTest.select('neglabel').toPandas(), negTestRes)
    #     print(pfpr[:5], ptpr[:5], nfpr[:5],ntpr[:5])
    #     plt.plot(pfpr, ptpr, label = 'posModel')
    #     plt.plot(nfpr, ntpr, label = 'negModel')
    #     plt.legend()
    #     plt.savefig('ROC.png')
    #     plt.close()
    #     # save the models
    #     posModel.save("www/pos.model")
    #     negModel.save("www/neg.model")

    #load instead
    #     posModel = CrossValidatorModel.load("www/pos.model")
    #     negModel = CrossValidatorModel.load("www/neg.model")
    #print("finished loading model")

    # TASK 8.1
    # selected column 'created_utc' and transformed in 10.2 using from_unixtime

    # TASK 8.2
    # title of submission of the comment
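    # Comment link_id values are prefixed with "t3_" (the submission fullname prefix);
    # stripping it leaves the bare submission id so we can join on submissions.id.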
    comments = comments.withColumn("clean_id",
                                   regexp_replace("link_id", r'^t3_', ''))
    comments = comments.join(
        submissions, comments.clean_id == submissions.id).drop(submissions.id)

    # TASK 8.3
    # Please see TASK 10.3 (by state) line 166

    # TASK 9
    #filter out comments that contain "\s" or start with "&gt"
    comments = comments.filter(~comments.body.rlike(r'^&gt')).\
        filter(~comments.body.rlike(r'\\s'))
    #sample
    comments = comments.sample(
        False, sampleRate,
        1)  # fixed seed so the sample is reproducible
    #redo 4,5,6a
    comments = comments.withColumn("ngrams", sanitize_udf(comments.body))
    comments = cv_model.transform(comments)
    #print("done with transforming the sampled comments")

    #make predictions
    comments = posModel.transform(comments).\
        drop("body", "link_id", "clean_id", "ngrams","rawPrediction", "probability").\
        withColumnRenamed("prediction", "poslabel")
    comments = negModel.transform(comments).drop("features", "rawPrediction", "probability").\
        withColumnRenamed("prediction", "neglabel")

    # TASK 10.1
    # compute the percentage of positive, negative comments
    #print("Percentage of positive comments")
    result = comments.select('poslabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("pos-perc.csv")
    #print("Percenetage of negative comments")
    result = comments.select('neglabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("neg-perc.csv")

    # TASK 10.2
    #2. by date
    comments = comments.withColumn(
        "date", from_unixtime(comments.created_utc, "yyyy-MM-dd"))
    result = comments.groupBy("date").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("time_data.csv")

    # TASK 10.3
    #3. by state
    val_state_udf = udf(lambda state: state if state in states else None,
                        StringType())
    comments = comments.withColumn(
        "state", val_state_udf(lower(comments.author_flair_text)))
    comments = comments.filter(comments.state.isNotNull())
    result = comments.groupBy("state").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.show(truncate=False)
    #print(result.count())
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("state_data.csv")

    # TASK 10.4
    #4a. by comment score
    result = comments.groupBy("commentscore").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("comment_score.csv")
    #4b. by story score
    result = comments.groupBy("storyscore").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("story_score.csv")

    # DELIVERABLE 4.
    story = result.orderBy('avg(poslabel)', ascending=False).limit(10)
    # join is too expensive, subquery is also expensive
    score_list = set(story.select('storyscore').toPandas()['storyscore'])
    comments[comments.storyscore.isin(score_list)].select(
        'storyscore', 'title').limit(20).show(truncate=False)

    story = result.orderBy('avg(neglabel)', ascending=False).limit(10)
    score_list = set(story.select('storyscore').toPandas()['storyscore'])
    comments[comments.storyscore.isin(score_list)].select(
        'storyscore', 'title').limit(20).show(truncate=False)
Example #12
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()

        label = ''
        for y in label_colm:
            label = y

        f = ""
        f = label + " ~ "

        for x in feature_colm:
            f = f + x + "+"
        f = f[:-1]
        f = (f)

        formula = RFormula(formula=f, featuresCol="features", labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        finalized_data = output.select("features", "label")

        finalized_data.show()

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        Accuracy_list = []

        FPR_list = []
        TPR_list = []
        precision_list = []
        recall_list = []
        lr = LogisticRegression(maxIter=5)
        lrModel = lr.fit(train_data)

        print("coefficients:" + str(lrModel.coefficientMatrix))
        print("intercept: " + str(lrModel.interceptVector))
        training_summary = lrModel.summary
        BinaryLogisticRegressionTrainingSummary.accuracy
        print(" area under roc : ", training_summary.areaUnderROC)
        print("  roc : ", training_summary.roc)
        roc = training_summary.roc
        roc.show()
        roc.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/ROC_plot.parquet',
            mode='overwrite')
        print(" pr value : ", training_summary.pr)
        pr = training_summary.pr
        pr.show()
        pr.write.parquet(
            'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/PR_plot.parquet',
            mode='overwrite')
        print(" precision by threshold : ",
              training_summary.precisionByThreshold)
        prec_by_threshold = training_summary.precisionByThreshold
        prec_by_threshold.show()
        print(" accuracy : ", training_summary.accuracy)
        accuracy_d = training_summary.accuracy
        print(accuracy_d)
        fMeasure = training_summary.fMeasureByThreshold
        fMeasure.show()
        maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
            'max(F-Measure)').head()
        bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
            .select('threshold').head()['threshold']
        lr.setThreshold(bestThreshold)
        objectiveHistory = training_summary.objectiveHistory
        print("objectiveHistory")
        for objective in objectiveHistory:
            print(objective)
        print("false positive rate by label:")
        for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))
        print("True positive rate")
        for i, rate in enumerate(training_summary.truePositiveRateByLabel):
            print("label %d : %s" % (i, rate))
        print("Precision by label:")
        for i, prec in enumerate(training_summary.precisionByLabel):
            print("label %d: %s" % (i, prec))
        print("Recall by label:")
        for i, rec in enumerate(training_summary.recallByLabel):
            print("label %d: %s" % (i, rec))
        print("F-measure by label:")
        for i, f in enumerate(training_summary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))
        accuracy = training_summary.accuracy
        falsePositiveRate = training_summary.weightedFalsePositiveRate
        truePositiveRate = training_summary.weightedTruePositiveRate
        fMeasure = training_summary.weightedFMeasure()
        precision = training_summary.weightedPrecision
        recall = training_summary.weightedRecall
        print(
            "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
            % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
               precision, recall))
        Accuracy_list.append(accuracy)
        FPR_list.append(falsePositiveRate)
        TPR_list.append(truePositiveRate)
        precision_list.append(precision)
        recall_list.append(recall)
        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)
        fpr = roc.select("FPR").toPandas()
        tpr = roc.select("TPR").toPandas()
        plt.plot(fpr, tpr)
        plt.show()
        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()
        plt.plot(pr_precision, pr_recall)
        plt.show()
        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy("label", "prediction").count().show()
        prediction_val.show()
        prediction_val.groupBy("prediction").count().show()
        prediction_val.groupBy("prediction", "probability").count().show()
Example #13
    def Logistic_regression(dataset_add, feature_colm, label_colm):

        dataset = spark.read.csv(dataset_add,
                                 header=True,
                                 inferSchema=True,
                                 sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()
        label = ''
        for y in label_colm:
            label = y

        print(label)

        # using the rformula for indexing, encoding and vectorising

        # f = ""
        # f = label + " ~ "
        #
        # for x in features:
        #     f = f + x + "+"
        # f = f[:-1]
        # f = (f)

        # extracting the schema

        val = dataset.schema

        string_features = []
        integer_features = []

        for x in val:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        string_features.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        integer_features.append(x.name)

        print(string_features)
        print(integer_features)
        print(val)
        # print(label)
        # label = 'y'

        for z in val:
            if (z.name == label and str(z.dataType) == "StringType"):
                label_indexer = StringIndexer(inputCol=label,
                                              outputCol='indexed_' +
                                              label).fit(dataset)
                dataset = label_indexer.transform(dataset)
            if (z.name == label and
                    str(z.dataType) in ("IntegerType", "FloatType", "DoubleType")):
                dataset = dataset.withColumnRenamed(label, 'indexed_' + label)

        ###########################################################################
        indexed_features = []
        encoded_features = []
        for col in string_features:
            indexer = StringIndexer(inputCol=col,
                                    outputCol='indexed_' + col).fit(dataset)
            indexed_features.append('indexed_' + col)
            dataset = indexer.transform(dataset)
            # dataset.show()
            # encoder = OneHotEncoderEstimator(inputCols=['indexed_'+col], outputCols=['encoded_'+col]).fit(dataset)
            # encoded_features.append('encoded_'+col)
            # dataset = encoder.transform(dataset)
            # dataset.show()

        print(indexed_features)
        print(encoded_features)

        # combining both the features colm together

        final_features = integer_features + indexed_features

        print(final_features)

        # now using the vector assembler

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")

        dataset = featureassembler.transform(dataset)
        dataset.show()

        # combining both the features colm together

        # output.show()
        # output.select("features").show()

        # output_features = dataset.select("features")

        # using the vector indexer (for categorical data kind of one hot encoding)

        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=15).fit(dataset)
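        # maxCategories=15: features with at most 15 distinct values are re-encoded as
        # category indices; features with more distinct values are left as continuous.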

        categorical_features = vec_indexer.categoryMaps
        print("Chose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))

        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()

        # preparing the finalized data

        finalized_data = vec_indexed.select('indexed_' + label,
                                            'vec_indexed_features')
        finalized_data.show()

        # formula = RFormula(formula=f,
        #                    featuresCol="features",
        #                    labelCol="label")
        #
        # output = formula.fit(dataset).transform(dataset)
        #
        # output_2 = output.select("features", "label")
        #
        # output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)

        # implementing the logistic regression
        # lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y = 0.1
        # x=[]
        for i in range(0, 3):
            y = round(y + 0.1, 2)

            lr = LogisticRegression(featuresCol='vec_indexed_features',
                                    labelCol='indexed_' + label,
                                    maxIter=5,
                                    regParam=0.1,
                                    elasticNetParam=1.0,
                                    threshold=0.3)

            # fit the model

            lrModel = lr.fit(train_data)
            lrModel

            # print the coefficients and the intercept for the logistic regression

            print("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))

            # getting the summary of the model

            # f-measure calculation
            from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary

            training_summary = lrModel.summary

            BinaryLogisticRegressionTrainingSummary.accuracy

            print(" area under roc : ", training_summary.areaUnderROC)
            print("  roc : ", training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print(" pr value : ", training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print(" precision by threshold : ",
                  training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print(" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print(accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
                'max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print("objectiveHistory")
            for objective in objectiveHistory:
                print(objective)

            # for a multiclass we can inspect  a matrix on a per label basis

            print("false positive rate by label:")
            for i, rate in enumerate(
                    training_summary.falsePositiveRateByLabel):
                print("label %d: %s" % (i, rate))

            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print(
                "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                % (accuracy, falsePositiveRate, truePositiveRate, fMeasure,
                   precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print(Accuracy_list)
        print(FPR_list)
        print(TPR_list)
        print(precision_list)
        print(recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()

        plt.plot(fpr, tpr)
        plt.show()

        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_precision, pr_recall)
        plt.show()

        # now applying the fit on the test data

        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy('indexed_' + label, "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #14
    def Logistic_regression(dataset_add, features, label):

        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True, sep=";")

        dataset.show()

        dataset.groupBy("y").count().show()

        # using the rformula for indexing, encoding and vectorising

        f = ""
        f = label + " ~ "

        for x in features:
            f = f + x + "+"
        f = f[:-1]
        f = (f)

        formula = RFormula(formula=f,
                           featuresCol="features",
                           labelCol="label")

        output = formula.fit(dataset).transform(dataset)

        output_2 = output.select("features", "label")

        output_2.show()

        # splitting the dataset into train and test

        train_data, test_data = output_2.randomSplit([0.75, 0.25], seed = 40)

        # implementing the logistic regression
        lr1 =LogisticRegression()

        Accuracy_list = []
        # Accuracy_list.append(accuracy)
        FPR_list = []
        # FPR_list.append(falsePositiveRate)
        TPR_list = []
        precision_list = []
        recall_list = []

        y= 0.1
        # x=[]
        for i in range(0,3):
            y=round(y+0.1,2)

            lr = LogisticRegression(maxIter=5, regParam=0.1, elasticNetParam=1.0, threshold=0.3)



            # fit the model


            lrModel = lr.fit(train_data)
            lrModel

            # print the coefficients and the intercept for the logistic regression

            print ("coefficients:" + str(lrModel.coefficientMatrix))
            # mat = (lrModel.coefficientMatrix)
            # print mat
            print("intercept: " + str(lrModel.interceptVector))





            # getting the summary of the model

            # f-measure calculation
            from pyspark.ml.classification import BinaryLogisticRegressionTrainingSummary

            training_summary = lrModel.summary

            BinaryLogisticRegressionTrainingSummary.accuracy

            print (" area under roc : " , training_summary.areaUnderROC)
            print ("  roc : " , training_summary.roc)
            roc = training_summary.roc
            roc.show()
            print (" pr value : " , training_summary.pr)
            pr = training_summary.pr
            pr.show()
            print (" precision by threshold : " , training_summary.precisionByThreshold)
            prec_by_threshold = training_summary.precisionByThreshold
            prec_by_threshold.show()

            print (" accuracy : ", training_summary.accuracy)
            accuracy_d = training_summary.accuracy
            print (accuracy_d)

            fMeasure = training_summary.fMeasureByThreshold

            fMeasure.show()

            maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
            bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
                .select('threshold').head()['threshold']
            lr.setThreshold(bestThreshold)

            # obtain the objective per iteration

            objectiveHistory = training_summary.objectiveHistory
            print ("objectiveHistory")
            for objective in objectiveHistory:
                print (objective)


            # for a multiclass we can inspect  a matrix on a per label basis

            print ("false positive rate by label:")
            for i, rate in enumerate(training_summary.falsePositiveRateByLabel):
                print ("label %d: %s" % (i, rate))


            print("True positive rate")
            for i, rate in enumerate(training_summary.truePositiveRateByLabel):
                print ("label %d : %s" % (i, rate))
            #
            # print("True Negative rate")
            # for i, rate in enumerate(training_summary)

            print("Precision by label:")
            for i, prec in enumerate(training_summary.precisionByLabel):
                print("label %d: %s" % (i, prec))

            print("Recall by label:")
            for i, rec in enumerate(training_summary.recallByLabel):
                print("label %d: %s" % (i, rec))

            print("F-measure by label:")
            for i, f in enumerate(training_summary.fMeasureByLabel()):
                print("label %d: %s" % (i, f))

            accuracy = training_summary.accuracy
            falsePositiveRate = training_summary.weightedFalsePositiveRate
            truePositiveRate = training_summary.weightedTruePositiveRate
            fMeasure = training_summary.weightedFMeasure()
            precision = training_summary.weightedPrecision
            recall = training_summary.weightedRecall
            print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
                  % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
            # Accuracy_list = []
            Accuracy_list.append(accuracy)
            # FPR_list = []
            FPR_list.append(falsePositiveRate)
            # TPR_list=[]
            TPR_list.append(truePositiveRate)
            precision_list.append(precision)
            recall_list.append(recall)

        print (Accuracy_list)
        print (FPR_list)
        print (TPR_list)
        print (precision_list)
        print (recall_list)

        import matplotlib.pyplot as plt
        #
        # plt.plot(recall_list, FPR_list)
        # plt.show()

        #
        # fpr = [0.0,0.0,0.0,0.0,0.003067484662576687, 0.003067484662576687, 0.006134969325153374, 0.11042944785276074, 0.1165644171779141, 0.1165644171779141, 0.23006134969325154, 0.9723926380368099, 0.9846625766871165 ]
        # tpr = [0.0, 0.09767441860465116, 0.10232558139534884, 0.13488372093023257 ,0.17674418604651163 ,0.3674418604651163 , 0.37209302325581395  , 0.7534883720930232, 0.8651162790697674 , 0.8697674418604651 , 0.9069767441860465, 0.9953488372093023, 1.0]
        # data visualization

        # ROC graph
        fpr = roc.select("FPR").toPandas()

        tpr = roc.select("TPR").toPandas()


        plt.plot(fpr, tpr)
        plt.show()


        # PR graph

        pr_recall = pr.select("recall").toPandas()
        pr_precision = pr.select("precision").toPandas()

        plt.plot(pr_precision,pr_recall)
        plt.show()


        # now applying the fit on the test data


        prediction_val = lrModel.transform(test_data)
        prediction_val.groupBy("label", "prediction").count().show()
        prediction_val.show()

        prediction_val.groupBy("prediction").count().show()

        prediction_val.groupBy("prediction", "probability").count().show()
Example #15
# set best threshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select(
    'max(F-Measure)').head()['max(F-Measure)']
print "maxFMeasure"
print maxFMeasure
fMeasure_new = spark.createDataFrame(fMeasure.rdd, ['threshold', 'fmeasure'])
bestThreshold = fMeasure_new.where(
    maxFMeasure - fMeasure_new.fmeasure < 0.00001).select('threshold').head()
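# The max F-measure row is matched with a small tolerance rather than exact float equality.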
print "bestThreshold"
print bestThreshold
print "original threshold"
print lr.getThreshold()

#lr.setThreshold(bestThreshold['threshold'])
lr.setThreshold(-3)
print "new threshold"
print lr.getThreshold()

#lr.setThreshold(bestThreshold)
# $example off$
#test
result_all = lrModel.transform(training)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
res = evaluator.evaluate(result_all)
print "evaluator res:"
print res
res = evaluator.evaluate(result_all, {evaluator.metricName: "areaUnderPR"})
print "evaluator pr res:"
print res