# Example 1 (score: 0)
def main(context):
    """Main function takes a Spark SQL context.

    End-to-end Reddit Trump-sentiment pipeline:
      1.  Load comments/submissions (parquet snapshots of the raw dumps).
      2.  Join comments with the hand-labeled sentiment CSV.
      4/5. Sanitize comment bodies into token arrays via the `sanitize` UDF.
      6.  Vectorize tokens with a binary CountVectorizer (minDF=5).
      7.  Train positive/negative logistic-regression classifiers
          with 5-fold cross-validation.
      8/9. Score the full comment/submission join with both models.
      10. Aggregate sentiment percentages (overall, by day, state,
          comment score, story score, gilded) and write CSV reports,
          then plot ROC curves for both classifiers.

    Relies on module-level names defined elsewhere in this file:
    `remove_first_three`, `san` (text sanitizer), `sc` (SparkContext),
    `metric` (BinaryClassificationMetrics), and the pyspark.ml classes.

    NOTE(review): the original mixed the global `sqlContext` with the
    `context` parameter.  The UDFs registered on `context` are used in
    SQL issued through `sqlContext`, so both must be the same session;
    this version uses the `context` parameter consistently.
    """
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # UDF used in the task-8 join to trim the first three characters of
    # comment link ids (the "t3_" prefix) so they match submission ids.
    context.udf.register("three", remove_first_three)

    # TASK 1
    # Code for task 1

    # Original (slow) path: parse the raw compressed JSON dumps.
    # comments = context.read.json("comments-minimal.json.bz2")
    # submissions = context.read.json("submissions.json.bz2")

    # comments = comments.sample(True, 0.2, None)
    # submissions = submissions.sample(True, 0.2, None)

    # Fast path: parquet snapshots written on a previous run.
    comments = context.read.load("comments_data.parquet")
    submissions = context.read.load("submissions_data.parquet")

    # TASK 2
    # Code for task 2

    # code for task 2 continues to the join in tasks 4, 5

    labeled_data = context.read.format('csv').options(
        header="true").load('labeled_data.csv')
    # `Input.id` needs backticks because of the dot in the column name.
    labeled = labeled_data.select(
        col("`Input.id`").alias("id"),
        col("labeldjt").alias("trump"))

    comments.createOrReplaceTempView('comments')
    submissions.createOrReplaceTempView('submissions')
    labeled.createOrReplaceTempView('labeled')

    # TASKS 4, 5
    # Code for tasks 4 and 5

    context.udf.register("sanitize", san)
    # labeldjt: 1 = positive about Trump, -1 = negative.  Split it into
    # two independent binary targets so each classifier is one-vs-rest.
    sanitized = context.sql(
        'select sanitize(body) as san, if(trump = 1, 1, 0) as positive, '
        'if(trump = -1, 1, 0) as negative from comments inner join labeled on comments.id = labeled.id'
    )
    # sanitize() yields one space-joined string; re-split it into the
    # array<string> column shape that CountVectorizer expects.
    sanitized = sanitized.withColumn(
        "san",
        split(col("san"), " ").cast("array<string>").alias("san"))

    # TASKS 6A, 6B
    # Code for tasks 6A and 6B

    # Binary presence/absence features; minDF=5 drops rare tokens.
    cv = CountVectorizer(inputCol="san",
                         outputCol="features",
                         binary=True,
                         minDF=5.0)
    cvmodel = cv.fit(sanitized)
    result = cvmodel.transform(sanitized)
    result.createOrReplaceTempView('results')

    pos = context.sql('select positive as label, features from results')
    neg = context.sql('select negative as label, features from results')

    # TASK 7
    # Code for task 7

    # Thresholds below 0.5 trade precision for recall on the (sparse)
    # positive classes.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10).setThreshold(0.25)

    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()

    # Single-point "grid": regParam fixed at 1.0 (a real search is slow).
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()

    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)

    # CV tunes internally, but we still need held-out test sets for the
    # ROC evaluation at the end.
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])

    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Optional save/load so retraining can be skipped on later runs:
    # posModel.save("pos.model")
    # negModel.save("neg.model")

    # posModel = CrossValidatorModel.load("pos.model")
    # negModel = CrossValidatorModel.load("neg.model")

    # TASK 8
    # Code for task 8

    # Join every comment to its submission (three() strips the "t3_"
    # link-id prefix).  Filters drop quoted comments ("&gt...") and
    # sarcasm-tagged ones ("\s").
    whole = context.sql(
        'select comments.id as id, comments.author_flair_text as state, '
        'comments.created_utc as time, comments.gilded as gilded, submissions.title as title, submissions.score as story_score, '
        'comments.score as comment_score, sanitize(body) as san from comments inner join '
        'submissions on submissions.id = three(comments.link_id) where body not like "&gt%" '
        'and body not like "%\\s%"')
    whole = whole.withColumn(
        "san",
        split(col("san"), " ").cast("array<string>").alias("san"))
    # whole.write.parquet("whole.parquet")
    # whole = context.read.load("whole.parquet")

    # TASK 9
    # Code for task 9

    whole_result_pos = cvmodel.transform(whole)
    # select("*") just takes a second reference to the same data so each
    # model transforms its own lineage.
    whole_result_neg = whole_result_pos.select("*")

    pos_ans = posModel.transform(whole_result_pos)
    neg_ans = negModel.transform(whole_result_neg)
    pos_ans.createOrReplaceTempView('positive')
    neg_ans.createOrReplaceTempView('negative')

    # TASK 10
    # Code for task 10

    # Merge the two predictions back into one row per comment.
    final = context.sql(
        'select positive.id as id, positive.state as state, positive.gilded as gilded, '
        'positive.time as time, positive.title as title, positive.comment_score as comment_score, '
        'positive.story_score as story_score, positive.prediction as pos, negative.prediction as neg '
        'from positive inner join negative on positive.id = negative.id')
    # final.write.parquet("final.parquet")
    # final = context.read.load('final.parquet')
    final.createOrReplaceTempView('final')

    states = [
        'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
        'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
        'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
        'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
        'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
        'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
        'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
        'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
        'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
    ]
    # Single-column DataFrame (column name "value") used to keep only
    # author_flair_text values that are real US states.
    statesdf = context.createDataFrame(states, StringType())
    statesdf.createOrReplaceTempView('states')

    # Percent of comments predicted positive/negative, sliced six ways.
    percentage_total = context.sql(
        'select title, 100 * sum(pos) / count(id) as Positive, '
        '100 * sum(neg) / count(id) as Negative from final group by title')
    percentage_day = context.sql(
        'select Date(from_unixtime(time)) as date, '
        '100 * sum(pos) / count(id) as Positive, 100 * sum(neg) / count(id) as Negative '
        'from final group by Date(from_unixtime(time)) order by Date(from_unixtime(time))'
    )
    percentage_state = context.sql(
        'select state, 100 * sum(pos) / count(id) as Positive, '
        '100 * sum(neg) / count(id) as Negative from final inner join states on states.value = final.state group by state'
    )
    percentage_comment_score = context.sql(
        'select comment_score as comment_score, '
        '100 * sum(pos) / count(id) as Positive, 100 * sum(neg) / count(id) as Negative from final group by comment_score'
    )
    percentage_story_score = context.sql(
        'select story_score as submission_score, '
        '100 * sum(pos) / count(id) as Positive, 100 * sum(neg) / count(id) as Negative from final group by story_score'
    )
    percentage_gilded = context.sql(
        'select gilded, 100 * sum(pos) / count(id) as Positive, '
        '100 * sum(neg) / count(id) as Negative from final group by gilded')

    # repartition(1) forces a single output CSV part file per report.
    percentage_total.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "true").save("total.csv")
    percentage_day.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "true").save("day.csv")
    percentage_state.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "true").save("state.csv")
    percentage_comment_score.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("comment_score.csv")
    percentage_story_score.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("story_score.csv")

    # EXTRA CREDIT

    percentage_gilded.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "true").save("gilded.csv")

    pos_testing = posModel.transform(posTest)
    neg_testing = negModel.transform(negTest)

    pos_results = pos_testing.select(['probability', 'label'])
    neg_results = neg_testing.select(['probability', 'label'])

    ## prepare score-label set
    # probability[0] is P(class 0), so the label is flipped (1 - label)
    # to match that score's orientation.
    pos_results_collect = pos_results.collect()
    neg_results_collect = neg_results.collect()

    pos_results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                        for i in pos_results_collect]
    pos_scoreAndLabels = sc.parallelize(pos_results_list)

    neg_results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                        for i in neg_results_collect]
    neg_scoreAndLabels = sc.parallelize(neg_results_list)

    pos_metrics = metric(pos_scoreAndLabels)
    print("The positive ROC score is: ", pos_metrics.areaUnderROC)

    neg_metrics = metric(neg_scoreAndLabels)
    print("The negative ROC score is: ", neg_metrics.areaUnderROC)

    # Headless backend must be selected before pyplot is imported.
    import matplotlib as mpl
    mpl.use('agg')
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    y_test = [i[1] for i in pos_results_list]
    y_score = [i[0] for i in pos_results_list]

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr,
             tpr,
             label='Trump Positive Sentiment; ROC curve (area = %0.2f)' %
             roc_auc)

    y_test = [i[1] for i in neg_results_list]
    y_score = [i[0] for i in neg_results_list]

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr,
             tpr,
             label='Trump Negative Sentiment; ROC curve (area = %0.2f)' %
             roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig("ROC_Graph")
# Example 2 (score: 0)
# Random-forest training script: assemble the remaining pipeline stages,
# fit on trainDF, evaluate ROC on a held-out split.
# Assumes module-level names defined elsewhere: `tmp` (list of earlier
# stages), `assembler`, `labelIndexer`, `trainDF`, `Pipeline`, `RF`
# (presumably RandomForestClassifier — confirm against the imports),
# `sc` (SparkContext) and `metric` (BinaryClassificationMetrics).
tmp += [assembler, labelIndexer]

pipeline = Pipeline(stages=tmp)

# Fit the preprocessing pipeline and apply it to the same frame.
allData = pipeline.fit(trainDF).transform(trainDF)

# Cache: allData is reused by the split and both fits below.
allData.cache()

# 80/20 train/validation split, seeded for reproducibility.
trainData, validData = allData.randomSplit([0.8, 0.2], seed=1)

randforest = RF(labelCol="label", featuresCol="features", numTrees=100)

rf_fit = randforest.fit(trainData)

transformed = rf_fit.transform(validData)

results = transformed.select(["probability", "label"])

# Collect (score, label) pairs to the driver.  probability[0] is
# P(class 0), so the label is flipped (1.0 - label) to match that
# score's orientation before computing the ROC.
results_collect = results.collect()

results_list = [(float(i[0][0]), 1.0 - float(i[1])) for i in results_collect]

score = sc.parallelize(results_list)

metrics = metric(score)

print("The ROC score is (numTrees=100): ", metrics.areaUnderROC)

print("complete")
# Example 3 (score: 0)
def main(context):
    """Main function takes a Spark SQL context.

    Trump-sentiment pipeline with on-disk caching: parsed data is
    snapshotted to parquet, and the CountVectorizer / cross-validated
    logistic-regression models are saved under www/ and reloaded on
    later runs instead of being retrained.  Produces accuracy numbers,
    an ROC plot (trump_ROC.png) and several aggregate CSV reports.

    Relies on module-level names defined elsewhere in this file:
    `os`, `reduce`, `sanitize`, `sc` (SparkContext), `plt`,
    `F`/`functions` (pyspark.sql.functions aliases), `CountVectorizer`,
    `ArrayType`, `StringType`, `DateType`.

    Fix vs. the original: a stray join that referenced `pos_states`
    before it was assigned (a guaranteed NameError) was removed; the
    correct join is performed after `pos_states` is built.
    """

    # TASK 1
    # Code for task 1...
    comments_DF = None
    submissions_DF = None
    labeled_data_DF = None

    comments_parquet = os.path.abspath("./comments-minimal.parquet")
    submissions_parquet = os.path.abspath("./submissions.parquet")
    labeled_data_parquet = os.path.abspath("./labeled_data.parquet")

    # For each source: read the parquet cache when present, otherwise
    # parse the raw file once and write the cache for the next run.
    if (os.path.exists(labeled_data_parquet)):
        labeled_data_DF = context.read.parquet(labeled_data_parquet)
    else:
        labeled_data_DF = context.read.csv("labeled_data.csv", header=True)
        labeled_data_DF.write.parquet(labeled_data_parquet)

    if (os.path.exists(submissions_parquet)):
        submissions_DF = context.read.parquet(submissions_parquet)
    else:
        submissions_DF = context.read.json("submissions.json.bz2")
        submissions_DF.write.parquet(submissions_parquet)

    if (os.path.exists(comments_parquet)):
        comments_DF = context.read.parquet(comments_parquet)
    else:
        comments_DF = context.read.json("comments-minimal.json.bz2")
        comments_DF.write.parquet(comments_parquet)

    # TASK 2
    # Code for task 2...

    labeled_data_DF.createOrReplaceTempView("labeled_data")
    comments_DF.createOrReplaceTempView("comments")
    # labeldjt: 1 = positive about Trump, -1 = negative, 0 = neutral.
    labeled_comments = context.sql(
        "select comments.id, cast(labeled_data.labeldjt as int) as label, body, author, author_flair_text, link_id, score, created_utc from labeled_data inner join comments on comments.id = labeled_data.Input_id"
    )
    #labeled_comments.select("id", "Input_id").show()
    #labeled_comments.show()

    # TASK 4, 5
    # Code for tasks 4 and 5
    # Wrap sanitize(): drop its first element, then flatten the
    # remaining space-joined gram strings into one flat token list.
    context.udf.register(
        "sanitize", lambda body: reduce(lambda acc, elem: acc + elem.split(),
                                        sanitize(body)[1:], []),
        ArrayType(StringType()))
    labeled_comments.createOrReplaceTempView("labeled_comments")
    combined = context.sql(
        "select *, sanitize(body) as words from labeled_comments")

    #combined.printSchema()
    #combined.select("body", "words").show()

    # TASK 6A
    # Code for task 6A...
    # Binary presence/absence features; minDF=5 drops rare tokens,
    # vocabSize caps the vocabulary at 2^18 terms.
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         minDF=5.0,
                         binary=True,
                         vocabSize=1 << 18)

    vectorize_model = cv.fit(combined)
    vectorized = vectorize_model.transform(combined)
    # Persist the vocabulary so task 9 scoring is reproducible later.
    vectorize_model.write().overwrite().save("www/vector.model")

    # TASK 6B
    # Code for task 6B...
    # Split the 3-way label into two independent binary targets.
    vectorized.createOrReplaceTempView("vectorized")
    labeled = context.sql(
        "select *, case when label = 1 then 1 else 0 end as poslabel, case when label = -1 then 1 else 0 end as neglabel from vectorized"
    )
    #labeled.show()

    # TASK 7
    # Code for task 7...
    pos = labeled
    neg = labeled

    # Bunch of imports (may need more)
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    posmodel_path = os.path.abspath("www/pos.model")
    negmodel_path = os.path.abspath("www/neg.model")

    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="poslabel",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel",
                               featuresCol="features",
                               maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)

    #we set threshold here to avoid doing extra sql queries at the end

    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])

    posModel = None
    negModel = None

    # Train the models (or reload them if a saved copy exists)
    if (os.path.exists(posmodel_path)):
        posModel = CrossValidatorModel.load(posmodel_path)
    else:
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.write().overwrite().save(posmodel_path)

    if (os.path.exists(negmodel_path)):
        negModel = CrossValidatorModel.load(negmodel_path)
    else:
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        negModel.write().overwrite().save(negmodel_path)

    # TEST MODEL
    posResult = posModel.transform(posTest)
    posResult.createOrReplaceTempView("posResult")
    posAccuracy = context.sql(
        "select avg(case when poslabel = prediction then 1 else 0 end) as accuracy from posResult"
    )
    #posAccuracy.show()

    negResult = negModel.transform(negTest)
    negResult.createOrReplaceTempView("negResult")
    negAccuracy = context.sql(
        "select avg(case when neglabel = prediction then 1 else 0 end) as accuracy from negResult"
    )
    #negAccuracy.show()

    # PLOT ROC CURVE
    from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric

    # probability[0] is P(class 0), so the label is flipped (1 - label)
    # to match that score's orientation.
    results = posResult.select(['probability', 'poslabel'])
    results_collect = results.collect()
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    from sklearn.metrics import roc_curve, auc
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr,
             tpr,
             'g--',
             label='Trump Positive Sentiment, ROC curve (area = %0.2f)' %
             roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    results = negResult.select(['probability', 'neglabel'])
    results_collect = results.collect()
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr,
             tpr,
             'r--',
             label='Trump Negative Sentiment, ROC curve (area = %0.2f)' %
             roc_auc)
    plt.legend(loc="lower right")
    plt.savefig('trump_ROC.png')

    # TASK 8
    # Code for task 8...
    #get title of submission post
    # SUBSTR strips the "t3_" prefix from link_id; the filters drop
    # sarcasm-tagged ('/s') and quoted ('&gt') comments.
    submissions_DF.createOrReplaceTempView("submissions")
    comments_DF.createOrReplaceTempView("comments")
    whole_data = context.sql(
        "select s.id as submission_id, s.title, s.author_cakeday, s.created_utc, s.author_flair_text, s.over_18, c.controversiality, c.body as body, c.id as comment_id, c.score as comment_score, s.score as story_score from comments c inner join submissions s on s.id = SUBSTR(c.link_id, 4, LENGTH(c.link_id) - 3) where body not like '%/s' and body not like '&gt%'"
    )
    whole_data.show(20)
    # Fixed-seed 50% sample (without replacement) to keep scoring cheap.
    sampled = whole_data.sample(False, 0.5, 42)
    #sampled.show(20)

    #whole_data.count()
    #sampled.count()

    # TASK 9
    # Code for task 9...
    # Re-register the sanitize UDF (idempotent; same wrapper as task 4/5).
    context.udf.register(
        "sanitize", lambda body: reduce(lambda acc, elem: acc + elem.split(),
                                        sanitize(body)[1:], []),
        ArrayType(StringType()))
    sampled.createOrReplaceTempView("sampled")
    combined = context.sql("select *, sanitize(body) as words from sampled")

    combined.printSchema()
    combined = combined.select("sampled.comment_id", "sampled.submission_id",
                               "sampled.title", "sampled.created_utc",
                               "sampled.author_flair_text",
                               "sampled.author_cakeday", "sampled.over_18",
                               "sampled.controversiality", "sampled.body",
                               "words", "sampled.comment_score",
                               "sampled.story_score")
    #combined.show()

    vectorized = vectorize_model.transform(combined)
    vectorized.show()

    # Run both classifiers; drop the intermediate prediction columns so
    # the second transform does not collide with the first.
    posResult = posModel.transform(vectorized)
    posResult = posResult.withColumnRenamed(
        'prediction', 'pos').drop("rawPrediction").drop("probability")
    result = negModel.transform(posResult)
    result = result.withColumnRenamed(
        'prediction', 'neg').drop("rawPrediction").drop("probability")

    # `temp` keeps the title for the top-10 queries at the end.
    temp = result
    temp = temp.drop("body", "words", "features")
    result = result.drop("body", "words", "features", "title")
    #result.show()

    # TASK 10
    # Code for task 10...
    result.createOrReplaceTempView("result")

    #number 1
    totalrows = result.count()
    PosPercentage = result.filter(result.pos == 1.0).count() * 100 / totalrows
    NegPercentage = result.filter(result.neg == 1.0).count() * 100 / totalrows

    print("Positive Percentage: {}%".format(PosPercentage))
    print("Negative Percentage: {}%".format(NegPercentage))

    #number 2
    #https://medium.com/@mrpowers/working-with-dates-and-times-in-spark-491a9747a1d2
    with_time = result.withColumn(
        "date",
        F.from_unixtime(functions.col('created_utc')).cast(DateType()))
    with_time_pos = with_time.groupBy("date").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    with_time_neg = with_time.groupBy("date").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    time_data = with_time_pos.join(with_time_neg, ["date"])

    time_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("time_data.csv")

    #number 3
    states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', \
    'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', \
    'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', \
    'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', \
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

    statelist = context.createDataFrame(
        states, StringType()
    )  #create data frame so we can join the two tables together, eliminate any non-state author_flair_text

    #https://stackoverflow.com/questions/40421356/how-to-do-left-outer-join-in-spark-sql
    #https://docs.databricks.com/spark/latest/faq/join-two-dataframes-duplicated-column.html
    #we found that the attribute name for statelist dataframe was "value" by using printSchema()
    # (A stray join against `pos_states` before it was defined was removed
    # here — it raised NameError; the real join happens below.)

    statelist = statelist.withColumnRenamed("value", "state")
    result = result.withColumnRenamed("author_flair_text", "state")
    pos_states = result.groupBy("state").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    new_pos_states = pos_states.join(statelist, ["state"], "inner")
    new_pos_states = new_pos_states.withColumnRenamed(
        "(sum(pos) / count(pos))", "Positive")
    #new_pos_states = new_pos_states.withColumnRenamed("author_flair_text", "state")
    neg_states = result.groupBy("state").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    new_neg_states = neg_states.join(statelist, ["state"], "inner")
    new_neg_states = new_neg_states.withColumnRenamed(
        "(sum(neg) / count(neg))", "Negative")
    #new_neg_states = new_neg_states.withColumnRenamed("author_flair_text", "state")
    #tried doing left_outer initially, but not all author flair text is a state, so need to do inner instead

    state_data = new_pos_states.join(new_neg_states, ["state"], "inner")

    state_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("state_data.csv")

    #final deliverable number 4
    commentPos = result.groupBy("comment_score").agg(
        functions.sum(result.pos) / functions.count(
            result.pos))  #for some reason scalar values don't work???
    storyPos = result.groupBy("story_score").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    commentNeg = result.groupBy("comment_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    storyNeg = result.groupBy("story_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))

    comment_data = commentPos.join(commentNeg, ["comment_score"])
    submission_data = storyPos.join(storyNeg, ["story_score"])

    comment_data.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("comment_data.csv")
    submission_data.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("submission_data.csv")

    #http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html
    #Final Deliverable part 4
    temp.createOrReplaceTempView("temp")
    top_pos = context.sql(
        "select title, (sum(pos) / count(pos)) as Positive from temp group by title order by Positive desc limit 10"
    )
    top_neg = context.sql(
        "select title, (sum(neg) / count(neg)) as Negative from temp group by title order by Negative desc limit 10"
    )
# Example 4 (score: 0)
def main(context):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    start = time.time()
    # task 1
    if(read_raw):
        comments = sqlContext.read.json('comments-minimal.json.bz2')
        submissions = sqlContext.read.json('submissions.json.bz2')
        label = sqlContext.read.load('labeled_data.csv', format = 'csv', sep = ',',header="true")
        print("load done")
        comments.write.parquet('comments')
        submissions.write.parquet('submissions')
        label.write.parquet('label')
    else:
        comments = context.read.load('comments')
        submissions = context.read.load('submissions')
        label = context.read.load('label')
    print("task 1 complete: read data")
    #result.show()

    if(training):
        # task 2
        associate = associated(comments, label).select(col('id'), col('body'), col('labeldjt'))
        print("task 2 complete: associate data")

        # task 4, 5
        newColumn = associate.withColumn('ngrams', sanitize_udf(associate['body']))
        print("task 4, 5 complete: generate unigrams")

        # task 6A
        cv = CountVectorizer(inputCol = 'ngrams', outputCol = "features", binary = True)
        model = cv.fit(newColumn)
        tmp = model.transform(newColumn)
        print("task 6A complete: cv model")

        # task 6B
        result = tmp.withColumn('poslabel', F.when(col('labeldjt') == 1, 1).otherwise(0))
        result = result.withColumn('neglabel', F.when(col('labeldjt') == -1, 1).otherwise(0))
        pos = result.select(col('poslabel').alias('label'), col('features'))
        neg = result.select(col('neglabel').alias('label'), col('features'))
        print("task 6B complete: relabel data")

        # task 7
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10)
        neglr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(
            estimator = poslr,
            evaluator = posEvaluator,
            estimatorParamMaps = posParamGrid,
            numFolds = 5)
        negCrossval = CrossValidator(
            estimator = neglr,
            evaluator = negEvaluator,
            estimatorParamMaps = negParamGrid,
            numFolds = 5)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])

        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.save("pos.model")
        negModel.save("neg.model")
        model.save("cv.model")
        print("task 7 complete: training")

        # posModel = CrossValidatorModel.load('pos.model')
        # negModel = CrossValidatorModel.load('neg.model')

        # point 7
        pos_trans = posModel.transform(posTest)
        neg_trans = negModel.transform(negTest)

        pos_results = pos_trans.select(['probability', 'label'])
        pos_trans_collect = pos_results.collect()
        pos_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in pos_trans_collect]
        pos_scoreAndLabels = sc.parallelize(pos_trans_results_list)

        pos_metrics = metric(pos_scoreAndLabels)
        print("The ROC score of positive results is: ", pos_metrics.areaUnderROC)

        neg_results = neg_trans.select(['probability', 'label'])
        neg_trans_collect = neg_results.collect()
        neg_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in neg_trans_collect]
        neg_scoreAndLabels = sc.parallelize(neg_trans_results_list)

        neg_metrics = metric(neg_scoreAndLabels)
        print("The ROC score of negative results is: ", neg_metrics.areaUnderROC)

        plot_ROC(pos_trans_results_list, 'positive_results')
        plot_ROC(neg_trans_results_list, 'negative_results')
        print("point 7 complete: ROC")

    else:
        model = CountVectorizerModel.load('cv.model')
        posModel = CrossValidatorModel.load('pos.model')
        negModel = CrossValidatorModel.load('neg.model')
        print("model loaded")

        # task 8
        comments_tmp = comments.select(col('id'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('score').alias('com_score'))
        comments_full = comments_tmp.withColumn('link_id', process_id_udf(comments_tmp['link_id']))
        submissions_full = submissions.select(col('id').alias('sub_id'), col('title'), col('score').alias('sub_score'))

        if(joinFull):
            com_sub = comments_full.join(submissions_full, comments_full.link_id == submissions_full.sub_id, 'inner')
            com_sub = com_sub.select(col('id'), col('title'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('com_score'), col('sub_score'))
            com_sub.write.parquet('com_sub')
        else:
            com_sub = context.read.load('com_sub')# .sample(False, 0.01, None)
        print('task 8 complete: comment with submission')

        # task 9
        filtered = com_sub.filter("body NOT LIKE '%/s%' and body NOT LIKE '&gt;%'")
        filtered_result = filtered.withColumn('ngrams', sanitize_udf(filtered['body']))
        feaResult = model.transform(filtered_result).select(col('id'), col('link_id'), col('created_utc'), \
                                    col('features'), col('author_flair_text'), col('com_score'), col('sub_score'), col('title'))
        posResult = posModel.transform(feaResult)
        negResult = negModel.transform(feaResult)
        print("transformed")

        pos = posResult.withColumn('pos', threshold_pos_udf(posResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'pos', 'com_score', 'sub_score', 'title')
        neg = negResult.withColumn('neg', threshold_neg_udf(negResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'neg', 'com_score', 'sub_score', 'title')
        #final_probs = pos.join(neg, pos.id == neg.id_neg, 'inner').select('id', 'created_utc', 'author_flair_text', 'title', 'pos', 'neg')
        #final_probs.show()
        #pos.write.parquet('pos')
        #neg.write.parquet('neg')
        print('task 9 complete: predict')

        # task 10
        # compute 1
        num_rows = pos.count()
        pos_filtered = pos.filter(pos.pos == 1)
        neg_filtered = neg.filter(neg.neg == 1)
        num_pos = pos_filtered.count()
        num_neg = neg_filtered.count()

        print('Percentage of positive comments: {}'.format(num_pos / num_rows))
        print('Percentage of negative comments: {}'.format(num_neg / num_rows))
        print('finish compute 1')

        # compute 2
        pos_time = pos.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType()))
        neg_time = neg.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType()))

        num_pos_time = pos_time.groupBy('time').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('time')
        num_neg_time = neg_time.groupBy('time').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('time')

        num_pos_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_pos_time')
        num_neg_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_neg_time')
        print('finish compute 2')

        # compute 3
        state = sqlContext.createDataFrame(states, StringType())
        pos_state = pos.groupBy('author_flair_text').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive'))
        neg_state = neg.groupBy('author_flair_text').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative'))

        pos_state = pos_state.join(state, pos_state.author_flair_text == state.value, 'inner')
        pos_state = pos_state.na.drop(subset=['value'])
        pos_state = pos_state.select(col('author_flair_text').alias('state'), col('Percentage of positive').alias('Positive'))

        neg_state = neg_state.join(state, neg_state.author_flair_text == state.value, 'inner')
        neg_state = neg_state.na.drop(subset=['value'])
        neg_state = neg_state.select(col('author_flair_text').alias('state'), col('Percentage of negative').alias('Negative'))

        pos_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_state')
        neg_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_state')
        print('finish compute 3')

        # compute 4
        pos_com_score = pos.groupBy('com_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('com_score')
        pos_sub_score = pos.groupBy('sub_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('sub_score')
        neg_com_score = neg.groupBy('com_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('com_score')
        neg_sub_score = neg.groupBy('sub_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('sub_score')

        pos_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_com_score')
        pos_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_sub_score')
        neg_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_com_score')
        neg_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_sub_score')
        print('finish compute 4')

        # compute 5
        pos_story = pos.groupBy('title').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy(F.desc('Percentage of positive')).limit(10)
        neg_story = neg.groupBy('title').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy(F.desc('Percentage of negative')).limit(10)

        pos_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_story')
        neg_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_story')
        print('finish compute 5')

        end = time.time()
        print('time consumed: {}'.format(end - start))
def main(context):
    """Run the full label/train/evaluate Reddit-sentiment pipeline.

    Args:
        context: the Spark SQL context (SQLContext / SparkSession) used for
            all file reads, SQL queries, and ML transformations.

    Side effects: writes ``comments.parquet``, ``submissions.parquet`` and
    ``label.parquet``; loads pre-trained models from ``pos.model`` /
    ``neg.model``; prints intermediate DataFrames and the ROC score.
    """
    # Use the context the caller passed in, rather than the module-level
    # `sqlContext` global the original body silently depended on.
    spark = context

    # Schema of the hand-labeled CSV: comment id plus three sentiment labels
    # (dem / gop / djt), each expected to be -1, 0 or 1.
    schema = StructType([
        StructField("id", StringType()),
        StructField("dem", IntegerType()),
        StructField("gop", IntegerType()),
        StructField("djt", IntegerType())
    ])

    comments = spark.read.json("comments-minimal.json.bz2")
    submissions = spark.read.json("submissions.json.bz2")
    labeled_data = spark.read.load("labeled_data.csv",
                                   format="csv",
                                   schema=schema,
                                   header="true")

    # Persist as parquet once so subsequent runs can skip the slow
    # JSON/CSV parsing step.
    comments.select("*").write.save("comments.parquet", format="parquet")
    submissions.select("*").write.save("submissions.parquet", format="parquet")
    labeled_data.select("*").write.save("label.parquet", format="parquet")

    # Reload from parquet so the rest of the pipeline reads the columnar copies.
    comments = spark.read.load("comments.parquet")
    submissions = spark.read.load("submissions.parquet")
    labeled_data = spark.read.load("label.parquet")

    comments.createOrReplaceTempView("commentsView")
    submissions.createOrReplaceTempView("submissionsView")
    labeled_data.createOrReplaceTempView("labeled_dataView")

    # Sanity check: join labeled ids against the raw comments.
    sqlDF = spark.sql(
        "SELECT * FROM labeled_dataView l JOIN commentsView c ON l.id = c.id")
    sqlDF.show()

    # NOTE(review): cleanTextWithPython / unionTextWithPython are UDFs that
    # must be registered elsewhere before main() runs — confirm against caller.
    test = spark.sql(
        "SELECT cleanTextWithPython(body) as grams FROM commentsView")
    test.show()

    # Sanitized n-grams joined with the human labels — the training corpus.
    res = spark.sql(
        "SELECT unionTextWithPython(cleanTextWithPython(c.body)) as grams, l.dem, l.gop, l.djt FROM commentsView c, labeled_dataView l where c.id = l.id"
    )
    res.show()

    # Binary bag-of-words features; tokens must appear in at least 5 docs.
    cv = CountVectorizer(inputCol="grams",
                         outputCol="features",
                         binary=True,
                         minDF=5.0)
    model = cv.fit(res)
    result = model.transform(res)
    result.show()

    # Derive the two binary targets from the djt label:
    # positive := (djt == 1), negative := (djt == -1).
    result.createOrReplaceTempView("resultView")
    ans = spark.sql(
        "SELECT features, IF(djt = 1, 1, 0) as positive,  IF(djt = -1, 1, 0) as negative from resultView"
    )

    ans.createOrReplaceTempView("pos1")
    ans.createOrReplaceTempView("neg1")

    pos = spark.sql("SELECT features, positive as label from pos1")
    neg = spark.sql("SELECT features, negative as label from neg1")

    # Logistic regression with class-weighted thresholds for each target.
    poslr = (LogisticRegression(labelCol="label",
                                featuresCol="features",
                                maxIter=10).setThresholds(
                                    [0.8, 0.2]).setThreshold(0.2))
    neglr = (LogisticRegression(labelCol="label",
                                featuresCol="features",
                                maxIter=10).setThresholds(
                                    [0.75, 0.25]).setThreshold(0.25))
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models

    print("Distribution of Pos and Neg in trainingData is: ",
          posTrain.groupBy("label").count().take(3))
    print("Distribution of Pos and Neg in testData is: ",
          posTest.groupBy("label").count().take(3))

    # Training is disabled: the saved models below were produced by a
    # previous run of this block.
    # posModel = posCrossval.fit(posTrain)
    # negModel = negCrossval.fit(negTrain)
    # posModel.save("pos.model")
    # negModel.save("neg.model")

    posModel = CrossValidatorModel.load('pos.model')
    negModel = CrossValidatorModel.load('neg.model')

    posTest_res = posModel.transform(posTest)
    negTest_res = negModel.transform(negTest)

    posTest_res.createOrReplaceTempView("posTest_res1")
    spark.sql("SELECT * from posTest_res1 where label <> 0.0").show(50)

    results = posTest_res.select(['probability', 'label'])
    ## prepare score-label set
    results_collect = results.collect()
    # probability[0] is P(class 0); flip the label (1 - label) to match.
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
    # NOTE(review): `sc` (SparkContext) and `metric` are module-level names
    # defined elsewhere in this file — confirm they are in scope.
    scoreAndLabels = sc.parallelize(results_list)

    metrics = metric(scoreAndLabels)
    print("The ROC score is: ", metrics.areaUnderROC)

    # Task 8: join each comment to its submission via the stripped link_id.
    task8 = spark.sql(
        "SELECT s.id, c.created_utc, s.title, c.author_flair_text, c.body FROM commentsView c join submissionsView s on removeheadWithPython(c.link_id) = s.id"
    )
    task8.createOrReplaceTempView("task8_view")
    test_res = spark.sql(
        "SELECT *, unionTextWithPython(cleanTextWithPython(removeSthWithPython(body))) as grams FROM task8_view where removeSthWithPython(body) <> '&gt' "
    )
def get_auc_score(scores_and_labels_data_set):
    """Return the area under the ROC curve for a (score, label) pair RDD."""
    evaluation = metric(scores_and_labels_data_set)
    return evaluation.areaUnderROC
# BUG FIX: the original line was Scala/JVM import syntax
# (`import org.apache.spark.ml.regression.LinearRegressionModel`), which in
# Python binds only the name `org`, leaving `LinearRegressionModel` undefined.
from pyspark.ml.regression import LinearRegressionModel

# Load the fitted model back from disk.
sameCVModel = LinearRegressionModel.load("/tmp/ml_ouput/test_model")




# Visualization: ROC score and ROC curve of the test predictions.

from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric
# Build (score, label) pairs from the model output and compute the ROC AUC.
# NOTE(review): `predictionTest` and `sc` are not defined in this snippet —
# this fragment assumes they are bound earlier in the file; confirm.
results = predictionTest.select(['probability', 'Y'])
## prepare score-label set
results_collect = results.collect()
# i[0][0] is P(class 0); the label is flipped (1.0 - label) so that the
# score and label refer to the same class.
results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in results_collect]
scoreAndLabels = sc.parallelize(results_list)
metrics = metric(scoreAndLabels)
print("The ROC score is (@numTrees=200): ", metrics.areaUnderROC)


from sklearn.metrics import roc_curve, auc

# Split the (score, label) pairs built above into sklearn-style vectors.
# NOTE(review): `results_list` is assumed to be bound by the preceding
# snippet in this file.
y_test = [i[1] for i in results_list]
y_score = [i[0] for i in results_list]

# CLEANUP: the original pre-initialized fpr/tpr/roc_auc to empty dicts and
# immediately overwrote them below — those dead stores are removed; the
# final module-level bindings are identical.
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

from pylab import *