Example #1
def sanitize_1(text):
    parsed, unigrams, bigrams, trigrams = cleantext.sanitize(text)
    unigram_array = unigrams.split(" ")
    bigram_array = bigrams.split(" ")
    trigram_array = trigrams.split(" ")

    joined_grams = unigram_array + bigram_array + trigram_array
    return joined_grams
Example #2
def clean_wrapper(body):
    parsed_text, unigrams, bigrams, trigrams = cleantext.sanitize(body)
    unigrams = unigrams.split(" ")
    bigrams = bigrams.split(" ")
    trigrams = trigrams.split(" ")
    ret = unigrams
    ret.extend(bigrams)
    ret.extend(trigrams)
    return ret
Example #3
def parse(z):
    res1 = []
    res2 = []
    wordList = sanitize(z)
    for i, val in enumerate(wordList[1:]):
        res1.append(val)
    for i, value in enumerate(wordList[1:]):
        for j, val in enumerate(value.split(" ")):
            res2.append(val)
    return res1 + res2
Example #4
def modified_sanitize(text):
    """
    Function that takes the output of sanitize and returns the output desired
    by project 2b specs
    """
    _, unigrams, bigrams, trigrams = sanitize(text)
    ret = unigrams.split()
    ret.extend(bigrams.split())
    ret.extend(trigrams.split())
    # return unigrams + bigrams + trigrams
    return ret
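Example #5 below applies a wrapper like this to every comment by registering it as a Spark SQL UDF. A minimal sketch of that pattern, assuming a SQL context named context and the project's cleantext.sanitize available at module level:

from pyspark.sql.types import ArrayType, StringType

# Hypothetical registration of the wrapper above so Spark SQL queries can call it.
context.udf.register("sanitize", modified_sanitize, ArrayType(StringType()))
# e.g. context.sql("select *, sanitize(body) as words from labeled_comments")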
Example #5
def main(context):
    """Main function takes a Spark SQL context."""

    # TASK 1
    # Code for task 1...
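    # Each raw input is cached as Parquet on the first run so later runs can
    # skip the expensive CSV/JSON parsing and read the columnar files instead.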
    comments_DF = None
    submissions_DF = None
    labeled_data_DF = None

    comments_parquet = os.path.abspath("./comments-minimal.parquet")
    submissions_parquet = os.path.abspath("./submissions.parquet")
    labeled_data_parquet = os.path.abspath("./labeled_data.parquet")

    if (os.path.exists(labeled_data_parquet)):
        labeled_data_DF = context.read.parquet(labeled_data_parquet)
    else:
        labeled_data_DF = context.read.csv("labeled_data.csv", header=True)
        labeled_data_DF.write.parquet(labeled_data_parquet)

    if (os.path.exists(submissions_parquet)):
        submissions_DF = context.read.parquet(submissions_parquet)
    else:
        submissions_DF = context.read.json("submissions.json.bz2")
        submissions_DF.write.parquet(submissions_parquet)

    if (os.path.exists(comments_parquet)):
        comments_DF = context.read.parquet(comments_parquet)
    else:
        comments_DF = context.read.json("comments-minimal.json.bz2")
        comments_DF.write.parquet(comments_parquet)

    # TASK 2
    # Code for task 2...

    labeled_data_DF.createOrReplaceTempView("labeled_data")
    comments_DF.createOrReplaceTempView("comments")
    labeled_comments = context.sql(
        "select comments.id, cast(labeled_data.labeldjt as int) as label, body, author, author_flair_text, link_id, score, created_utc from labeled_data inner join comments on comments.id = labeled_data.Input_id"
    )
    #labeled_comments.select("id", "Input_id").show()
    #labeled_comments.show()

    # TASK 4, 5
    # Code for tasks 4 and 5
    context.udf.register(
        "sanitize", lambda body: reduce(lambda acc, elem: acc + elem.split(),
                                        sanitize(body)[1:], []),
        ArrayType(StringType()))
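    # reduce() flattens the unigram/bigram/trigram strings returned by sanitize
    # (index 0, the parsed text, is dropped by the [1:] slice) into a single
    # token list, matching the wrapper functions in the earlier examples.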
    labeled_comments.createOrReplaceTempView("labeled_comments")
    combined = context.sql(
        "select *, sanitize(body) as words from labeled_comments")

    #combined.printSchema()
    #combined.select("body", "words").show()

    # TASK 6A
    # Code for task 6A...
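    # CountVectorizer builds the vocabulary from the sanitized tokens:
    # minDF=5.0 drops terms that appear in fewer than 5 comments, binary=True
    # records presence/absence rather than counts, and vocabSize caps the
    # vocabulary at 2^18 terms.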
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         minDF=5.0,
                         binary=True,
                         vocabSize=1 << 18)

    vectorize_model = cv.fit(combined)
    vectorized = vectorize_model.transform(combined)
    vectorize_model.write().overwrite().save("www/vector.model")

    # TASK 6B
    # Code for task 6B...
    vectorized.createOrReplaceTempView("vectorized")
    labeled = context.sql(
        "select *, case when label = 1 then 1 else 0 end as poslabel, case when label = -1 then 1 else 0 end as neglabel from vectorized"
    )
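    # poslabel/neglabel recode the single label column into two binary targets
    # (label = 1 -> positive, label = -1 -> negative) so that two independent
    # binary classifiers can be trained below.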
    #labeled.show()

    # TASK 7
    # Code for task 7...
    pos = labeled
    neg = labeled

    # Bunch of imports (may need more)
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    posmodel_path = os.path.abspath("www/pos.model")
    negmodel_path = os.path.abspath("www/neg.model")

    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="poslabel",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel",
                               featuresCol="features",
                               maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)

    # We set the decision thresholds here so that each model's prediction
    # column already encodes the positive/negative call, avoiding extra SQL
    # queries on the probability column at the end.

    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
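    # A broader (hypothetical) grid could be searched instead, e.g.:
    #   ParamGridBuilder().addGrid(poslr.regParam, [0.01, 0.1, 1.0]) \
    #                     .addGrid(poslr.elasticNetParam, [0.0, 0.5]).build()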
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])

    posModel = None
    negModel = None

    # Train the models
    if (os.path.exists(posmodel_path)):
        posModel = CrossValidatorModel.load(posmodel_path)
    else:
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.write().overwrite().save(posmodel_path)

    if (os.path.exists(negmodel_path)):
        negModel = CrossValidatorModel.load(negmodel_path)
    else:
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        negModel.write().overwrite().save(negmodel_path)

    # TEST MODEL
    posResult = posModel.transform(posTest)
    posResult.createOrReplaceTempView("posResult")
    posAccuracy = context.sql(
        "select avg(case when poslabel = prediction then 1 else 0 end) as accuracy from posResult"
    )
    #posAccuracy.show()

    negResult = negModel.transform(negTest)
    negResult.createOrReplaceTempView("negResult")
    negAccuracy = context.sql(
        "select avg(case when neglabel = prediction then 1 else 0 end) as accuracy from negResult"
    )
    #negAccuracy.show()

    # PLOT ROC CURVE
    from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric

    results = posResult.select(['probability', 'poslabel'])
    results_collect = results.collect()
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
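    # sc below is assumed to be the global SparkContext created in the script's
    # __main__ block; the metrics object's areaUnderROC could serve as a sanity
    # check against the sklearn AUC computed next.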
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    from sklearn.metrics import roc_curve, auc
    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr,
             tpr,
             'g--',
             label='Trump Positive Sentiment, ROC curve (area = %0.2f)' %
             roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: Trump sentiment classifiers')
    plt.legend(loc="lower right")

    results = negResult.select(['probability', 'neglabel'])
    results_collect = results.collect()
    results_list = [(float(i[0][0]), 1.0 - float(i[1]))
                    for i in results_collect]
    scoreAndLabels = sc.parallelize(results_list)
    metrics = metric(scoreAndLabels)

    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr,
             tpr,
             'r--',
             label='Trump Negative Sentiment, ROC curve (area = %0.2f)' %
             roc_auc)
    plt.legend(loc="lower right")
    plt.savefig('trump_ROC.png')

    # TASK 8
    # Code for task 8...
    #get title of submission post
    submissions_DF.createOrReplaceTempView("submissions")
    comments_DF.createOrReplaceTempView("comments")
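    # A comment's link_id carries a "t3_" prefix, so SUBSTR strips the first
    # three characters before joining against submissions.id; the WHERE clause
    # filters out sarcastic comments (ending in "/s") and quoted comments
    # (starting with "&gt").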
    whole_data = context.sql(
        "select s.id as submission_id, s.title, s.author_cakeday, s.created_utc, s.author_flair_text, s.over_18, c.controversiality, c.body as body, c.id as comment_id, c.score as comment_score, s.score as story_score from comments c inner join submissions s on s.id = SUBSTR(c.link_id, 4, LENGTH(c.link_id) - 3) where body not like '%/s' and body not like '&gt%'"
    )
    whole_data.show(20)
    sampled = whole_data.sample(False, 0.5, 42)
    #sampled.show(20)

    #whole_data.count()
    #sampled.count()

    # TASK 9
    # Code for task 9...
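    # Re-registering the "sanitize" UDF simply replaces the earlier definition,
    # which keeps this block usable on its own.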
    context.udf.register(
        "sanitize", lambda body: reduce(lambda acc, elem: acc + elem.split(),
                                        sanitize(body)[1:], []),
        ArrayType(StringType()))
    sampled.createOrReplaceTempView("sampled")
    combined = context.sql("select *, sanitize(body) as words from sampled")

    combined.printSchema()
    combined = combined.select("sampled.comment_id", "sampled.submission_id",
                               "sampled.title", "sampled.created_utc",
                               "sampled.author_flair_text",
                               "sampled.author_cakeday", "sampled.over_18",
                               "sampled.controversiality", "sampled.body",
                               "words", "sampled.comment_score",
                               "sampled.story_score")
    #combined.show()

    vectorized = vectorize_model.transform(combined)
    vectorized.show()

    posResult = posModel.transform(vectorized)
    posResult = posResult.withColumnRenamed(
        'prediction', 'pos').drop("rawPrediction").drop("probability")
    result = negModel.transform(posResult)
    result = result.withColumnRenamed(
        'prediction', 'neg').drop("rawPrediction").drop("probability")

    temp = result
    temp = temp.drop("body", "words", "features")
    result = result.drop("body", "words", "features", "title")
    #result.show()

    # TASK 10
    # Code for task 10...
    result.createOrReplaceTempView("result")

    #number 1
    totalrows = result.count()
    PosPercentage = result.filter(result.pos == 1.0).count() * 100 / totalrows
    NegPercentage = result.filter(result.neg == 1.0).count() * 100 / totalrows

    print("Positive Percentage: {}%".format(PosPercentage))
    print("Negative Percentage: {}%".format(NegPercentage))

    #number 2
    #https://medium.com/@mrpowers/working-with-dates-and-times-in-spark-491a9747a1d2
    with_time = result.withColumn(
        "date",
        functions.from_unixtime(functions.col('created_utc')).cast(DateType()))
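    # from_unixtime converts the epoch seconds to a timestamp string and the
    # DateType cast truncates it to a calendar day, so the aggregations below
    # produce one row per day.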
    with_time_pos = with_time.groupBy("date").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    with_time_neg = with_time.groupBy("date").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    time_data = with_time_pos.join(with_time_neg, ["date"])

    time_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("time_data.csv")

    #number 3
    states = [
        'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
        'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
        'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
        'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
        'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
        'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
        'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
        'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
        'West Virginia', 'Wisconsin', 'Wyoming'
    ]

    statelist = context.createDataFrame(
        states, StringType()
    )  #create data frame so we can join the two tables together, eliminate any non-state author_flair_text

    #https://stackoverflow.com/questions/40421356/how-to-do-left-outer-join-in-spark-sql
    #https://docs.databricks.com/spark/latest/faq/join-two-dataframes-duplicated-column.html
    #we found that the attribute name for statelist dataframe was "value" by using printSchema()
    new_pos_states = pos_states.join(statelist, ["author_flair_text"], "inner")

    statelist = statelist.withColumnRenamed("value", "state")
    result = result.withColumnRenamed("author_flair_text", "state")
    pos_states = result.groupBy("state").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    new_pos_states = pos_states.join(statelist, ["state"], "inner")
    new_pos_states = new_pos_states.withColumnRenamed(
        "(sum(pos) / count(pos))", "Positive")
    #new_pos_states = new_pos_states.withColumnRenamed("author_flair_text", "state")
    neg_states = result.groupBy("state").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    new_neg_states = neg_states.join(statelist, ["state"], "inner")
    new_neg_states = new_neg_states.withColumnRenamed(
        "(sum(neg) / count(neg))", "Negative")
    #new_neg_states = new_neg_states.withColumnRenamed("author_flair_text", "state")
    #tried doing left_outer initially, but not all author flair text is a state, so need to do inner instead

    state_data = new_pos_states.join(new_neg_states, ["state"], "inner")

    state_data.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("state_data.csv")

    #final deliverable number 4
    commentPos = result.groupBy("comment_score").agg(
        functions.sum(result.pos) / functions.count(result.pos)
    )  # fraction of comments predicted positive, per comment score
    storyPos = result.groupBy("story_score").agg(
        functions.sum(result.pos) / functions.count(result.pos))
    commentNeg = result.groupBy("comment_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))
    storyNeg = result.groupBy("story_score").agg(
        functions.sum(result.neg) / functions.count(result.neg))

    comment_data = commentPos.join(commentNeg, ["comment_score"])
    submission_data = storyPos.join(storyNeg, ["story_score"])

    comment_data.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("comment_data.csv")
    submission_data.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("submission_data.csv")

    #http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html
    #Final Deliverable part 4
    temp.createOrReplaceTempView("temp")
    top_pos = context.sql(
        "select title, (sum(pos) / count(pos)) as Positive from temp group by title order by Positive desc limit 10"
    )
    top_neg = context.sql(
        "select title, (sum(neg) / count(neg)) as Negative from temp group by title order by Negative desc limit 10"
    )
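    # top_pos and top_neg are computed here but never persisted; they could be
    # written out the same way as the CSV deliverables above, e.g.
    # (hypothetically):
    #   top_pos.repartition(1).write.format("com.databricks.spark.csv") \
    #       .option("header", "true").save("top_pos.csv")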
Example #6
def preprocess(comment):
    _, unigrams, bigrams, trigrams = sanitize(comment)
    processed_comment = unigrams.split() + bigrams.split() + trigrams.split()
    return processed_comment
Example #7
def m_UDF(text):
    grams = sanitize(text)
    return grams[1].split() + grams[2].split() + grams[3].split()
Example #8
def clean(body):
    parsed_text, unigrams, bigrams, trigrams = cleantext.sanitize(body)
    retMe = unigrams.split(" ")
    retMe.extend(bigrams.split(" "))
    retMe.extend(trigrams.split(" "))
    return retMe
Example #9
def makeNGrams(body):
    grams = cleantext.sanitize(body)
    return grams
Example #10
def get_ngrams(text):
    ngrams = []
    temp = cleantext.sanitize(text)
    ngrams = temp[1] + ' ' + temp[2] + ' ' + temp[3]
    return ngrams.split(' ')
Example #11
def sanitize_new(text):
    r = sanitize(text)[1:]
    return r[0].split(" ") + r[1].split(" ") + r[2].split(" ")
Example #12
def u_sanitize(text):
    return cleantext.sanitize(text)
Example #13
def sanitize_wrapper(text):
    ret_list = sanitize(text)
    unigrams, bigrams, trigrams = ret_list[1:]
    return unigrams.split(" ") + bigrams.split(" ") + trigrams.split(" ")
Example #14
def sanitizeX(s):
    w = cln.sanitize(s)  # sanitize takes a string (commment) and returns a list
    return w[1].split(' ') + w[2].split(' ') + w[3].split(' ')
Example #15
def san(text):
    return " ".join(cleantext.sanitize(text)[1:3])
Example #16
def get_combined_ngrams(comment_body):
    results = sanitize(comment_body)
    unigrams = results[1].split(' ')
    bigrams = results[2].split(' ')
    trigrams = results[3].split(' ')
    return (unigrams + bigrams + trigrams)
Example #17
def sanitize_and_concat(line):
    result = cleantext.sanitize(line)
    re = result[1:]
    re = list(chain(*re))

    return re