Code Example #1
File: hottracks.py Project: jprorama/mpdmine
# ## Load and prep data
#
# * Load the full data set
# * Load the picked k=100 approx Nearest Neighbor results
# * Build song recommendations based on songs in the nearest playlist

# In[3]:

mpd_all = mpd.load(spark, "onebig", 1)

# Get the ranked popularity of songs in the mpd.

# In[32]:

cv = CountVectorizer(inputCol="track_uri",
                     outputCol="features",
                     minDF=2,
                     vocabSize=2000000)

# In[33]:

model = cv.fit(mpd_all.select("pid", "tracks.track_uri"))

# In[35]:

result = model.transform(mpd_all.select("pid", "tracks.track_uri"))

# In[36]:

#model, result = mpd.vectorizecol(mpd_all.select("pid", "tracks.track_uri"), "track_uri", "features", 2000000)
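
# A small follow-up sketch (not in the original notebook): Spark's CountVectorizer
# orders its vocabulary by corpus-wide term frequency, so the fitted model already
# yields the ranked popularity of songs mentioned above.
ranked_tracks = model.vocabulary  # most frequent track_uri first
top_tracks = ranked_tracks[:100]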

# In[37]:
Code Example #2
"""
This portion of the code creates topics and associated words
using Latent Dirichlet Allocation
@author: [email protected]
"""
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.clustering import LDA, LDAModel

sqlContext = SQLContext(sc)
path = "./advisorconversations/advsisortext.txt"

data = sc.textFile(path).zipWithIndex().map(
    lambda pair: Row(idd=pair[1], words=pair[0].split(" ")))
docDF = sqlContext.createDataFrame(data)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus_size = result.count()  # total number of documents
corpus = result.select("idd", "vectors").rdd.map(lambda row: [row[0], row[1]]).cache()

# Cluster the documents into four topics using LDA
ldaModel = LDA.train(corpus, k=4, maxIterations=100, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 50  # number of words per topic
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
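
# A possible continuation (assumption; the original excerpt stops here): map each
# topic's term indices back to words through the CountVectorizer vocabulary.
topics_final = topicIndices.map(
    lambda topic: [vocabArray[i] for i in topic[0]]).collect()
for topic_id, terms in enumerate(topics_final):
    print("Topic %d: %s" % (topic_id, " ".join(terms)))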
##################################################
################## Transformers ##################
##################################################

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Descript",
                                outputCol="words",
                                pattern="\\W")
# stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c",
                 "the"]  # standard stop words
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered",
                               outputCol="features",
                               vocabSize=10000,
                               minDF=5)
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")
transformers = [
    regexTokenizer, stopwordsRemover, countVectors, label_stringIdx
]
pipeline = Pipeline(stages=transformers)
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

### Randomly split data into training and test sets; set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

testData.show(5)
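
# A minimal modelling sketch (assumption, not part of the original snippet): fit a
# simple logistic regression on the pipeline output built above.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=20)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.select("Descript", "Category", "label", "prediction").show(5)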
Code Example #4
File: fms_log_test.py Project: ljldgup/ml
sc.brao


def remove_low_freq_words(words):
    # print(words)
    rst = list(filter(lambda x: x in high_freq_words, words))
    # print(rst)
    return rst


remove_low_freq_words = udf(remove_low_freq_words, ArrayType(StringType()))

df = df.withColumn('high_freq_words',
                   remove_low_freq_words(col('context_words')))
cv = CountVectorizer(inputCol="high_freq_words",
                     outputCol="words_features",
                     vocabSize=len(high_freq_words))
model = cv.fit(df)
df = model.transform(df)

kmeans = KMeans(featuresCol="words_features",
                predictionCol="kmeans_prediction").setK(100).setSeed(1)
model = kmeans.fit(df)
predictions = model.transform(df)
predictions.select(['context_words', 'high_freq_words',
                    'kmeans_prediction']).show(100, truncate=False)

#################################################################
# Group statistics by thread
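# A minimal sketch of that per-thread grouping (assumption: the log DataFrame has a
# hypothetical 'thread' column; the original code is cut off at this point).
df.groupBy('thread').count().orderBy('count', ascending=False).show(20, truncate=False)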

Code Example #5
def main(review_table,business_table,output_folder):


    #Read reviews and business data
    review_df = spark.read.parquet(review_table)
    review_df.createOrReplaceTempView("reviews_table")

    business_df = spark.read.parquet(business_table)
    business_toronto=business_df.filter(business_df.City=="Toronto")
    business_toronto.createOrReplaceTempView("business_table")

    #collect reviews for each business
    business_review=spark.sql( """ SELECT BusinessID, collect_set(Review) AS total_review FROM reviews_table GROUP BY BusinessID """ )

    #convert reviews to string format
    merge_review = udf(lambda total_review: (" ").join(total_review))
    business_concat_review=business_review.withColumn("comb_review", merge_review(business_review['total_review'])).drop(business_review['total_review'])
    business_concat_review.createOrReplaceTempView("comb_review_table")

    #Keep reviews for businesses in Toronto
    Reviews_for_business=spark.sql(""" SELECT c.BusinessID,b.Name AS BusinessName,b.BusinessStars,c.comb_review FROM comb_review_table AS c INNER JOIN business_table AS b ON c.BusinessID=b.BusinessID """)

    #pipeline to preprocess text data
    regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'comb_review', outputCol = 'token')
    stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'no_stopword')
    countVectorizer = CountVectorizer(inputCol="no_stopword", outputCol="rawcol")
    TDF = IDF(inputCol="rawcol", outputCol="idf_vec")
    text_pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, TDF])

    IDF_model = text_pipeline.fit(Reviews_for_business)
    #IDF_model.write().overwrite().save('IDF_model1')

    #collect the vocabulary from the count vectorizer model
    vocab=IDF_model.stages[2].vocabulary

    business_review_df=IDF_model.transform(Reviews_for_business)

    #two business categories based on low and high star ratings
    reviews_low=business_review_df.where(business_review_df.BusinessStars<=3)
    reviews_high=business_review_df.where(business_review_df.BusinessStars>3)

    lda = LDA(k=6, seed=123, optimizer='online', featuresCol="idf_vec")
    vocab_word = udf(lambda termIndices: [vocab[idx] for idx in termIndices])

    #topic modelling on low rating business
    lowtopic_model = lda.fit(reviews_low)
    lowtopic_transform=lowtopic_model.transform(reviews_low)
    print("topic distribution for low rating business")
    lowtopic_transform.select('BusinessID','BusinessName','topicDistribution').show(4,False)
    #lowtopic_model.write().overwrite().save('lowtopic_model')
    
    #topic distribution
    low_dist=lowtopic_transform.withColumn('topic_distribution',lowtopic_transform['topicDistribution'].cast('string')).drop('topicDistribution')
    low_dist_df=low_dist.select('BusinessID','BusinessName','topic_distribution')    
    low_dist_df.write.csv(output_folder + '/Topic_low_business_topic_dist',header=True)
    
    #key topics
    lowreview_topics=lowtopic_model.describeTopics() 
    lowreview_topics_concat=lowreview_topics.withColumn("topic_word", vocab_word(lowreview_topics['termIndices']))
    
    
    low_df=lowreview_topics_concat.select('topic','topic_word')
    print("Topics for low rating business")
    low_df.show(6,False)
    low_df.coalesce(1).write.csv(output_folder + '/Topic_low_rating_topic',header=True)

    
    #topic modelling on high rating business
    high_topic_model = lda.fit(reviews_high)
    hightopic_transform=high_topic_model.transform(reviews_high)
    print("topic distribution for high rating business")
    hightopic_transform.select('BusinessID','BusinessName','topicDistribution').show(4,False)
    #high_topic_model.write().overwrite().save('high_topic_model')
    
    #topic distribution
    high_dist=hightopic_transform.withColumn('topic_distribution',hightopic_transform['topicDistribution'].cast('string')).drop('topicDistribution')
    high_dist_df=high_dist.select('BusinessID','BusinessName','topic_distribution')
    high_dist_df.write.csv(output_folder + '/Topic_high_business_topic_dist',header=True)

    #key topic 
    highreview_topics=high_topic_model.describeTopics()
    highreview_topics_concat=highreview_topics.withColumn("topic_word", vocab_word(highreview_topics['termIndices']))
    high_df=highreview_topics_concat.select('topic','topic_word')
    
    print("\nTopics for high rating business")
    high_df.show(6,False)
    high_df.coalesce(1).write.csv(output_folder + '/Topic_high_rating_topic',header=True)
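
# A minimal runner sketch (assumption; the full file presumably creates `spark` and
# does its imports above this excerpt): pass the two parquet paths and an output
# folder on the command line.
if __name__ == '__main__':
    import sys
    main(sys.argv[1], sys.argv[2], sys.argv[3])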
Code Example #6
train.groupby('final_status').count().show()
'''
+------------+-----+
|final_status|count|
+------------+-----+
|           0|69629| #0.679
|           1|32853| #0.320
+------------+-----+
'''

#Text columns
#desc,keywords [TEXT]


train_test=train_test.withColumn('keyword_features',split(col('keywords'),'-'))
cv = CountVectorizer(inputCol="keyword_features", outputCol="keyword_features_cv")
model=cv.fit(train_test)
train_test = model.transform(train_test)
train_test.show(truncate=False)




train.columns
#['project_id', 'name', 'desc', 'goal', 'keywords', 'disable_communication_encoded', 'country', 'currency', 'deadline', 'state_changed_at', 'created_at', 'launched_at', 'final_status', 'countryindexed', 'country_features', 'currencyindexed', 'currency_features', 'keyword_features', 'keyword_features_cv', 'diff_statechange_deadline', 'diff_created_deadline', 'diff_launched_deadline', 'diff_statechange_launched']

train_test=train_test.withColumn('diff_statechange_deadline',(train_test.state_changed_at-train_test.deadline)/86400)
train_test=train_test.withColumn('diff_created_deadline',(train_test.deadline-train_test.created_at)/86400)
train_test=train_test.withColumn('diff_launched_deadline',(train_test.deadline-train_test.launched_at)/86400)
train_test=train_test.withColumn('diff_statechange_launched',(train_test.state_changed_at-train_test.launched_at)/86400)
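
# A possible next step (assumption, not in the original snippet): assemble the
# engineered columns listed above into a single feature vector for a classifier.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=['goal', 'country_features', 'currency_features', 'keyword_features_cv',
               'diff_statechange_deadline', 'diff_created_deadline',
               'diff_launched_deadline', 'diff_statechange_launched'],
    outputCol='features')
train_test = assembler.transform(train_test)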
Code Example #7
def main(context):
    """Main function takes a Spark SQL context."""
    # TASK 1: load data

    # Read from original source files
    comments = context.read.json("comments-minimal.json.bz2")
    submissions = context.read.json("submissions.json.bz2")
    labels = context.read.csv("labeled_data.csv", header=True)
    '''
    # Write to parquet files
    comments.write.parquet("comments.parquet")
    submissions.write.parquet("submissions.parquet")
    labels.write.parquet("labels.parquet")
    # Read from parquet files
    comments = context.read.parquet("comments.parquet")
    submissions = context.read.parquet("submissions.parquet")
    labels= context.read.parquet("labels.parquet")
    '''

    comments = comments.select("id", "body", "created_utc",
                               "author_flair_text", "link_id",
                               col("score").alias("c_score"))
    submissions = submissions.select("title", "id",
                                     col("score").alias("s_score"))
    labels = labels.select("Input_id", "labeldjt")

    # TASK 2: join labeled_data with comments_minimal
    comments_labels = labels.join(comments,
                                  labels.Input_id == comments.id).select(
                                      "id", "body", "created_utc",
                                      "author_flair_text", "link_id",
                                      "labeldjt")

    # TASK 4, 5: Generate unigrams, bigrams, and trigrams for each comment in the labeled data,
    # store all of them into one column and split them by words.
    sanitize_udf = udf(sanitize, ArrayType(StringType()))
    split_udf = udf(split_arr_to_word, ArrayType(StringType()))
    sanitized_table = comments_labels.select("id", "labeldjt", \
            split_udf(sanitize_udf("body")).alias("sanitized_text"))

    # TASK 6A: Turn raw features into a sparse feature vector. Only use tokens that appear more than 10 times.
    cv = CountVectorizer(minDF=10.0,
                         inputCol="sanitized_text",
                         outputCol="vectors")
    cv_table = cv.fit(sanitized_table)
    vec_table = cv_table.transform(sanitized_table)

    # TASK 6B: Add columns for positive and negative labels
    final = vec_table.withColumn("positive", F.when(vec_table.labeldjt == 1, 1).otherwise(0))\
            .withColumn("negative", F.when(vec_table.labeldjt == -1, 1).otherwise(0))
    pos = final.select(col("id"),
                       col("vectors").alias("features"),
                       col("positive").alias("label"))
    neg = final.select(col("id"),
                       col("vectors").alias("features"),
                       col("negative").alias("label"))

    # TASK 7: Initialize two logistic regression models.

    # Code to generate the models:
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)

    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
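    # A minimal evaluation sketch (not in the original): score the held-out splits
    # with the fitted models; the evaluators default to areaUnderROC.
    print("Positive classifier AUC: " +
          str(posEvaluator.evaluate(posModel.transform(posTest))))
    print("Negative classifier AUC: " +
          str(negEvaluator.evaluate(negModel.transform(negTest))))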
    '''
    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")
    
    # To load saved models:
    posModel = CrossValidatorModel.load("project2/pos.model")
    negModel = CrossValidatorModel.load("project2/neg.model")
    '''

    # TASK 8: read more parts of comments
    fix_link_udf = udf(remove_first_three, StringType())
    comments_fixed = comments.select(
        col("id").alias("comment_id"),
        fix_link_udf("link_id").alias("link_id"), "created_utc", "body",
        col("author_flair_text").alias("state"), "c_score")
    new_table, _ = submissions.join(
        comments_fixed,
        comments_fixed.link_id == submissions.id).randomSplit([0.2, 0.8])

    # TASK 9:
    # remove any comments that contain '\s' or '&gt;'
    new_table = new_table.filter(~new_table.body.contains("&gt;")
                                 & ~new_table.body.contains("\s"))
    # Repeat task 4 and 5 and 6A
    sanitized_new_table = new_table.select("link_id", "state", "comment_id", "body", "created_utc", \
            "title", split_udf(sanitize_udf("body")).alias("sanitized_text"), "s_score", "c_score")
    final_table = cv_table.transform(sanitized_new_table)

    # Run the models
    ith = udf(ith_, FloatType())
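    # ith_ is defined earlier in the full script (not shown in this excerpt); it
    # presumably returns element i of a probability vector as a float.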
    task9_table = final_table.select("link_id", "state", "comment_id", "body",
                                     "created_utc", "title", "c_score",
                                     "s_score", "sanitized_text",
                                     col("vectors").alias("features"))
    task9_table = posModel.transform(task9_table)
    task9_table = task9_table.withColumn(
        "pos",
        F.when(ith(task9_table.probability, lit(1)) > 0.2,
               1).otherwise(0)).select("link_id", "state", "comment_id",
                                       "body", "c_score", "s_score",
                                       "created_utc", "title", "features",
                                       "pos")
    task9_table = negModel.transform(task9_table)
    task9_table = task9_table.withColumn(
        "neg",
        F.when(ith(task9_table.probability, lit(1)) > 0.25,
               1).otherwise(0)).select("link_id", "state", "comment_id",
                                       "body", "c_score", "s_score",
                                       "created_utc", "title", "pos", "neg")

    # TASK 10: calculate statistics
    # Part 1:
    part1 = task9_table.groupBy().agg(
        F.avg("pos").alias("pos"),
        F.avg("neg").alias("neg"))
    # Part 2:
    part2 = task9_table.groupBy(from_unixtime("created_utc",
                                              "yyyy-MM-dd")).agg(
                                                  F.avg("pos").alias("pos"),
                                                  F.avg("neg").alias("neg"))
    # Part 3:
    states = [
        'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
        'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
        'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
        'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
        'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
        'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
        'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
        'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
        'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
    ]
    part3 = task9_table.where(col("state").isin(states)).groupBy("state").agg(
        F.avg("pos").alias("pos"),
        F.avg("neg").alias("neg"))
    # Part 4:
    part4 = task9_table.groupBy("title").agg(
        F.avg("pos").alias("pos"),
        F.avg("neg").alias("neg"))
    by_c_score = task9_table.groupBy("c_score").agg(
        F.avg("pos").alias("pos"),
        F.avg("neg").alias("neg"))
    by_s_score = task9_table.groupBy("s_score").agg(
        F.avg("pos").alias("pos"),
        F.avg("neg").alias("neg"))

    # Save the data
    part1.repartition(1).write.format("com.databricks.spark.csv").save(
        "part1.csv")
    part2.repartition(1).write.format("com.databricks.spark.csv").save(
        "part2.csv")
    part3.repartition(1).write.format("com.databricks.spark.csv").save(
        "part3.csv")
    by_c_score.repartition(1).write.format("com.databricks.spark.csv").save(
        "c_score.csv")
    by_s_score.repartition(1).write.format("com.databricks.spark.csv").save(
        "s_score.csv")
    part4.repartition(1).write.format("com.databricks.spark.csv").save(
        "part4.csv")

    # Part 5: calculate percentage of positive and negative comments by month
    # (computed from the part2 output above, so it must be built before it is saved)
    time = context.read.csv("part2.csv")
    time = time.withColumn("pos", time["_c1"].cast(FloatType()))\
            .withColumn("neg", time["_c2"].cast(FloatType()))\
            .drop("_c1").drop("_c2").na.drop()
    part5 = time.groupBy(F.month(time._c0), F.year(time._c0)).agg(
        F.avg("pos").alias("pos"),
        F.avg("neg").alias("neg"))
    part5.repartition(1).write.format("com.databricks.spark.csv").save(
        "part5.csv")

    # for plot 4
    part4.sort(col("pos").desc()).limit(10).repartition(1).write.format(
        "com.databricks.spark.csv").save("plot4_pos.csv")
    part4.sort(col("neg").desc()).limit(10).repartition(1).write.format(
        "com.databricks.spark.csv").save("plot4_neg.csv")
Code Example #8
            out_vec.append(t_stem)       
    return out_vec

# Create user defined function for stemming with return type Array<String>
stemmer_udf = udf(lambda x: stem(x), ArrayType(StringType()))

# Create new df with vectors containing the stemmed tokens
vector_stemmed_df = (
    cleaned
        .withColumn("vector_stemmed", stemmer_udf("words"))
  )


# vectorize
cv = CountVectorizer(inputCol="vector_stemmed", outputCol="vectors")
print('done')
count_vectorizer_model = cv.fit(vector_stemmed_df)
print('done')
result = count_vectorizer_model.transform(vector_stemmed_df)

corpus = result.select(F.col('id').cast("long"), 'vectors').rdd \
    .map(lambda x: [x[0], x[1]])

# Running LDA after processing the data
lda_model = LDA.train(rdd=corpus, k=5, seed=12, maxIterations=50)
# extracting topics
topics = lda_model.describeTopics(maxTermsPerTopic=10)
# extract the vocabulary
vocabulary = count_vectorizer_model.vocabulary
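
# A short follow-up sketch (assumption; not in the original excerpt): print the top
# stemmed terms for each topic by mapping term indices back to words.
for topic_id, (term_indices, weights) in enumerate(topics):
    top_terms = [vocabulary[i] for i in term_indices]
    print("Topic %d: %s" % (topic_id, ", ".join(top_terms)))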
Code Example #9
ngram.transform(data_token).select('bigram').show(truncate = False)

# Tfidf
from pyspark.ml.feature import HashingTF, IDF
hashing_tf = HashingTF(inputCol = 'tokens', outputCol = 'rawfeatures')
data_token = hashing_tf.transform(data_token)

idf = IDF(inputCol = 'rawfeatures', outputCol = 'features')
idf_model = idf.fit(data_token)

data_tfidf = idf_model.transform(data_token)
data_tfidf.show()

# CountVectorizer
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol = 'tokens', outputCol = 'features', minDF = 2)

result = cv.fit(data_token).transform(data_token)
result.show()

############# model fitting
tokenizer = Tokenizer(inputCol = 'text', outputCol = 'token')
remover = StopWordsRemover(inputCol = 'token', outputCol = 'token_stop')
cv = CountVectorizer(inputCol = 'token_stop', outputCol = 'token_cv')
idf = IDF(inputCol = 'token_cv', outputCol = 'token_tfidf')
assembler = VectorAssembler(inputCols = ['token_tfidf', 'length'], outputCol = 'features')

indexer_y = StringIndexer(inputCol = 'Survived', outputCol = 'label')

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
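
# A minimal fitting sketch (assumption; the original excerpt stops at the model
# definition): chain the stages above into a Pipeline and train Naive Bayes,
# assuming a DataFrame `data` with 'text', 'length' and 'Survived' columns.
from pyspark.ml import Pipeline

nb_pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, assembler, indexer_y, nb])
nb_model = nb_pipeline.fit(data)
nb_model.transform(data).select('label', 'prediction', 'probability').show(5)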
Code Example #10
def main(sqlContext):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE

    comments = sqlContext.read.json("comments-minimal.json.bz2")
    submissions = sqlContext.read.json("submissions.json.bz2")
    label = sqlContext.read.csv('labeled_data.csv',
                                header=True,
                                inferSchema=True)
    # comments.write.parquet("comments.parquet")
    # submissions.write.parquet("submissions.parquet")
    # label.write.parquet("label.parquet")
    # comments = sqlContext.read.parquet("comments.parquet")
    # comments.createOrReplaceTempView("comments")
    # submissions = sqlContext.read.parquet("submissions.parquet")
    # submissions.createOrReplaceTempView("submissions")
    # label = sqlContext.read.parquet("label.parquet")
    # label.createOrReplaceTempView("label")

    # task 2
    df2 = sqlContext.sql(
        '''SELECT DISTINCT(label.Input_id),comments.*, label.labeldem, label.labelgop, label.labeldjt FROM label INNER JOIN comments ON label.Input_id=comments.id '''
    )

    # Question 1:
    # Input_id -> (labeldem, labelgop, labeldjt)

    # Question 2:
    # The data frame is not normalized, with redundant data. Given a comment id, we can uniquely identify the body of
    # the comment. Given the author URL, we can find the can_gild status. Given author URL and subreddit_id, we can find
    # author flair text and its css class string. We can decompose it into 4NF after identifying some functional
    # dependencies. We believe the collector of the data stored it in this way because it is easier to update one table
    # at a time than to update multiple ones. Also, a user's status might change over time, so the collector
    # collected a snapshot of the comments with the most current information.
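    # A sketch of that decomposition (illustrative only, based on the dependencies
    # described above):
    #   comment(id, body, created_utc, link_id, subreddit_id, author, score, ...)
    #   author(author, can_gild)
    #   flair(author, subreddit_id, author_flair_text, author_flair_css_class)
    #   label(Input_id, labeldem, labelgop, labeldjt)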

    # task 4 and 5
    sanitize_udf = udf(cleantext.sanitize, ArrayType(StringType()))
    df4 = df2.withColumn('sanitized', sanitize_udf('body'))
    concat_udf = udf(concat_string_array, ArrayType(StringType()))
    df5 = df4.withColumn('ngram', concat_udf('sanitized'))
    df5 = df5.drop('sanitized')  # 1779
    # df5.write.parquet("df5.parquet")

    # task 6A
    # df5 = sqlContext.read.parquet("df5.parquet")
    cv = CountVectorizer(inputCol="ngram", outputCol="features", minDF=6)
    model = cv.fit(df5)
    # model.save('cv.model')
    result = model.transform(df5)
    # result.show(truncate=False)

    # task 6B
    result.createOrReplaceTempView("result")
    df6_positive = sqlContext.sql(
        "SELECT *, IF(labeldjt==1,1,0) AS label FROM result")
    df6_negative = sqlContext.sql(
        "SELECT *, IF(labeldjt==-1,1,0) AS label FROM result")

    # task 7
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10).setThreshold(0.4)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = df6_positive.randomSplit([0.5, 0.5])
    negTrain, negTest = df6_negative.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    # posModel.save("pos.model")
    # negModel.save("neg.model")

    # task 8
    ss = submissions.sample(False, 0.2, None)
    df8 = comments.join(ss,
                        comments.link_id.substr(4, 12) == ss.id).select(
                            comments.created_utc,
                            comments.score.alias('cscore'),
                            ss.score.alias('sscore'), ss.title, ss.locked,
                            ss.over_18, comments.author_flair_text,
                            comments.id, comments.body)

    # Question 3
    # df8.explain()
    # == Physical Plan ==
    # *(2) Project [created_utc#10L, score#20L AS cscore#1619L, score#92L AS sscore#1620L, title#106, locked#76, over_18#83, author_flair_text#3, id#14, body#4]
    # +- *(2) BroadcastHashJoin [substring(link_id#16, 4, 12)], [id#69], Inner, BuildRight
    #    :- *(2) Project [author_flair_text#3, body#4, created_utc#10L, id#14, link_id#16, score#20L]
    #    :  +- *(2) Filter isnotnull(link_id#16)
    #    :     +- *(2) FileScan parquet [author_flair_text#3,body#4,created_utc#10L,id#14,link_id#16,score#20L] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/media/sf_vm-shared/comments.parquet], PartitionFilters: [], PushedFilters: [IsNotNull(link_id)], ReadSchema: struct<author_flair_text:string,body:string,created_utc:bigint,id:string,link_id:string,score:big...
    #    +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false]))
    #       +- *(1) Filter isnotnull(id#69)
    #          +- *(1) Sample 0.0, 0.2, false, 1565934737914995123
    #             +- *(1) FileScan parquet [id#69,locked#76,over_18#83,score#92L,title#106] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/media/sf_vm-shared/submissions.parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:string,locked:boolean,over_18:boolean,score:bigint,title:string>

    # df8 = comments.join(submissions, comments.link_id.substr(4, 12) == submissions.id)\
    #     .select(comments.created_utc, comments.score.alias('cscore'), submissions.score.alias('sscore'), submissions.title, comments.author_flair_text, comments.id, comments.body)
    # df8.write.parquet("df8.parquet")

    # task 9
    # model = CountVectorizerModel.load('cv.model')
    # posModel = CrossValidatorModel.load("pos.model")
    # negModel = CrossValidatorModel.load("neg.model")
    df92 = df8.filter("body NOT LIKE '%/s%'").filter("body NOT LIKE '&gt%'")
    df94 = df92.withColumn('sanitized', sanitize_udf('body'))
    df95 = df94.withColumn('ngram', concat_udf('sanitized'))
    df95 = df95.drop('sanitized')
    df9 = model.transform(df95)
    df9pos = posModel.transform(df9).withColumnRenamed(
        'prediction', 'Positive').drop('rawPrediction', 'probability', 'ngram')
    df10 = negModel.transform(df9pos).withColumnRenamed(
        'prediction', 'Negative').drop('rawPrediction', 'probability',
                                       'features')
    # df10.write.parquet("df10.parquet")
    # df9.write.parquet("df9.parquet")
    # posResult.write.parquet("df9pos.parquet")
    # negResult.write.parquet("df9neg.parquet")

    # task 10
    # df9 = sqlContext.read.parquet("df9.parquet")
    # posResult = sqlContext.read.parquet("df9pos.parquet")
    # negResult = sqlContext.read.parquet("df9neg.parquet")
    states = [
        'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
        'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
        'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
        'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
        'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
        'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
        'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
        'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
        'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
    ]

    # 1
    df10.groupBy().avg('Positive', 'Negative').show()
    # +------------------+------------------+
    # |     avg(Positive)|     avg(Negative)|
    # +------------------+------------------+
    # |0.3285865077437555|0.4712543825234617|
    # +------------------+------------------+

    # 2
    byDate = df10.select(
        to_date(df10.created_utc.cast('timestamp')).alias('date'),
        df10.Positive,
        df10.Negative).groupBy('date').avg('Positive', 'Negative')
    byDate.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save('time_data.csv')
    # +----------+-------------------+
    # |      date|    avg(prediction)|
    # +----------+-------------------+
    # |2017-08-11| 0.3432203389830508|
    # |2017-09-11| 0.6041666666666666|
    # |2017-01-06|0.45698166431593795|
    # |2017-02-26| 0.2857142857142857|
    # |2017-01-27| 0.4057971014492754|
    # |2017-09-28|              0.495|
    # |2016-12-19|0.31800766283524906|
    # |2016-11-08| 0.3527644230769231|
    # |2017-01-24| 0.4975514201762977|
    # |2017-06-29| 0.3684210526315789|
    # |2017-09-29| 0.4827586206896552|
    # |2017-07-31| 0.4574898785425101|
    # |2017-02-16| 0.4444444444444444|
    # |2017-08-18| 0.4482758620689655|
    # |2017-12-02|0.37116564417177916|
    # |2017-08-14| 0.3838383838383838|
    # |2017-10-23| 0.4175824175824176|
    # |2017-12-25|0.44907407407407407|
    # |2017-04-09| 0.3409090909090909|
    # |2017-03-28| 0.4363143631436314|
    # +----------+-------------------+

    # 3
    byState = df10[df10.author_flair_text.isin(states)].groupBy(
        'author_flair_text').avg('Positive', 'Negative')
    byState.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save('state_data.csv')
    # +-----------------+-------------------+
    # |author_flair_text|    avg(prediction)|
    # +-----------------+-------------------+
    # |             Utah|0.37254901960784315|
    # |           Hawaii|0.42857142857142855|
    # |        Minnesota| 0.3856893542757417|
    # |             Ohio|  0.410427807486631|
    # |           Oregon| 0.4177831912302071|
    # |         Arkansas| 0.3548387096774194|
    # |            Texas|         0.43359375|
    # |     North Dakota| 0.4126984126984127|
    # |     Pennsylvania|0.42705882352941177|
    # |      Connecticut|0.40119760479041916|
    # |          Vermont|0.38028169014084506|
    # |         Nebraska| 0.4528301886792453|
    # |           Nevada| 0.4110429447852761|
    # |       Washington|  0.407436096049574|
    # |         Illinois|0.44341801385681295|
    # |         Oklahoma|               0.43|
    # |         Delaware|                0.4|
    # |           Alaska| 0.4423076923076923|
    # |       New Mexico| 0.5076923076923077|
    # |    West Virginia|                0.5|
    # +-----------------+-------------------+

    # 4
    byCommentScore = df10.groupBy('cscore').avg('Positive', 'Negative')
    byStoryScore = df10.groupBy('sscore').avg('Positive', 'Negative')
    byCommentScore.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save('comment_score.csv')
    byStoryScore.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save('story_score.csv')

    # 5 Locked vs Unlocked
    byDateLocked = df10.filter('locked == true').select(
        to_date(df10.created_utc.cast('timestamp')).alias('date'),
        df10.Positive,
        df10.Negative).groupBy('date').avg('Positive', 'Negative')
    byDateUnlocked = df10.filter('locked == false').select(
        to_date(df10.created_utc.cast('timestamp')).alias('date'),
        df10.Positive,
        df10.Negative).groupBy('date').avg('Positive', 'Negative')
    byDateLocked.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save('locked_data.csv')
    byDateUnlocked.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save('unlocked_data.csv')

    # 5 over_18
    byDate18 = df10.filter('over_18 == true').select(
        to_date(df10.created_utc.cast('timestamp')).alias('date'),
        df10.Positive,
        df10.Negative).groupBy('date').avg('Positive', 'Negative')
    byDate18.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save('over18_data.csv')

    # final 4
    dff4 = df10.groupBy('title').agg(
        avg('Positive').alias('avgPos'),
        avg('Negative').alias('avgNeg'))
    dff4.orderBy('avgPos', ascending=0).limit(10).show(truncate=False)
    dff4.orderBy('avgNeg', ascending=0).limit(10).show(truncate=False)
Code Example #11
def main(context):
    """Main Function takes a Spark SQL Context."""
    #---------------------------------------------------------------------------
    # TASK 1
    # Code for task 1...
    # df = context.read.csv('labeled_data.csv')
    # df.write.parquet("labeled_data.parquet")
    # comments = context.read.json("comments-minimal.json.bz2")
    # comments.write.parquet("comments.parquet")
    # submissions = context.read.json("submissions.json.bz2")
    # submissions.write.parquet("submissions.parquet")
    labeled_data = context.read.parquet('labeled_data.parquet')
    labeled_data = labeled_data.withColumnRenamed("_c0", "Input_id")\
                               .withColumnRenamed("_c1", "labeldem")\
                               .withColumnRenamed("_c2", "labelgop")\
                               .withColumnRenamed("_c3", "labeldjt")
    # labeled_data.show()
    comments = context.read.parquet('comments.parquet')
    # comments.show()
    submissions = context.read.parquet('submissions.parquet')
    # submissions.show()

    #---------------------------------------------------------------------------
    # TASK 2
    # Code for task 2...
    labeled_comments = labeled_data.join(comments,
                                         comments.id == labeled_data.Input_id)
    labeled_comments = labeled_comments.select('Input_id', 'labeldjt', 'body')
    # labeled_comments.show()

    #---------------------------------------------------------------------------
    # TASK 4
    # Code for task 4...
    sanitize_udf = udf(sanitize, ArrayType(StringType()))

    #---------------------------------------------------------------------------
    # TASK 5
    # Code for task 5...
    sanitized_labeled_comments = labeled_comments.select(
        'Input_id', 'labeldjt',
        sanitize_udf('body').alias('raw'))

    #---------------------------------------------------------------------------
    # TASK 6A
    # Code for task 6A...
    cv = CountVectorizer(binary=True,
                         minDF=10.0,
                         inputCol="raw",
                         outputCol="features")
    model = cv.fit(sanitized_labeled_comments)
    sanitized_labeled_comments = model.transform(sanitized_labeled_comments)
    sanitized_labeled_comments.show(truncate=False)
    countVectorizerPath = "count_vectorizer_model"
    model.save(countVectorizerPath)

    #---------------------------------------------------------------------------
    # TASK 6B
    # Code for task 6B...
    # Labels: {1, 0, -1, -99}
    pos = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    pos = pos.withColumnRenamed("labeldjt", "label")
    pos = pos.replace(-1, 0)
    pos = pos.replace(-99, 0)
    # pos.show()

    neg = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    neg = neg.withColumnRenamed("labeldjt", "label")
    neg = neg.replace(1, 0)
    neg = neg.replace(-99, 0)
    neg = neg.replace(-1, 1)
    # neg.show()

    #---------------------------------------------------------------------------
    # TASK 7
    # Code for task 7...
    # ... MACHINE LEARNING PORTION TO TRAIN MODELS - Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # Positive Model: posModel
    # Negative Model: negModel

    #---------------------------------------------------------------------------
    # TASK 8
    # Code for task 8...
    # ... Make Final Deliverable for Unseen Data - We don't need labeled_data anymore
    strip_t3_udf = udf(strip_t3, StringType())
    sarcastic_or_quote_udf = udf(sarcastic_or_quote, BooleanType())
    # Get Unseen Data
    sanitized_final_deliverable = comments.select('created_utc', strip_t3_udf(comments.link_id).alias('link_id'), 'author_flair_text', 'id', 'body', 'gilded', sanitize_udf('body').alias('raw'), comments.score.alias('c_score'))\
        .filter(sarcastic_or_quote_udf(comments['body'])) #F.when(comments["body"].rlike('^&gt|\/s'), False).otherwise(True))
    # sanitized_final_deliverable.show()

    #---------------------------------------------------------------------------
    # TASK 9
    # Code for task 9...
    # Load models that we saved on previous runs of this script
    model = CountVectorizerModel.load("count_vectorizer_model")
    posModel = CrossValidatorModel.load("project2/pos.model")
    negModel = CrossValidatorModel.load("project2/neg.model")

    # Sanitize TASK 8 - Run the CountVectorizerModel on TASK 8 Relation
    sanitized_final_deliverable = model.transform(sanitized_final_deliverable)

    # Run classifier on unseen data to get positive labels
    posResult = posModel.transform(sanitized_final_deliverable)
    # Rename the 3 new columns to prevent name conflicts
    posResult = posResult.withColumnRenamed("probability", "probability_pos")\
                         .withColumnRenamed("rawPrediction", "rawPrediction_pos")\
                         .withColumnRenamed("prediction", "prediction_pos")
    # Run the classifier on previous positive result to get negative labels too
    result = negModel.transform(posResult)
    # Rename the 3 new columns to make it easier to see which is which
    result = result.withColumnRenamed("probability", "probability_neg")\
                    .withColumnRenamed("rawPrediction", "rawPrediction_neg")\
                    .withColumnRenamed("prediction", "prediction_neg")

    # UDF functions for predicting label based on thresholds
    predict_pos_udf = udf(predict_pos, IntegerType())
    predict_neg_udf = udf(predict_neg, IntegerType())
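    # predict_pos / predict_neg are defined earlier in the full script (not in this
    # excerpt); presumably they threshold the positive-class probability and return
    # 1 or 0.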

    # Make predictions based on probability and threshold:
    result = result.select('created_utc', 'author_flair_text', 'link_id', 'id', 'c_score', 'gilded',\
                                 predict_pos_udf(result.probability_pos).alias('pos'),\
                                 predict_neg_udf(result.probability_neg).alias('neg'))

    result.write.parquet("result.parquet")
    # result.show()

    #---------------------------------------------------------------------------
    # TASK 10
    # Code for task 10...
    # ... Perform Analysis on the Predictions
    result = context.read.parquet("result.parquet")
    submissions = submissions.select('id', 'title',
                                     submissions.score.alias('s_score'))
    result = result.join(submissions,
                         result.link_id == submissions.id)  # .explain()
    result.show()
    context.registerDataFrameAsTable(result, "result")

    # 1. Percentage of Comments that Were Positive/Negative Across ALL Submissions
    task_10_1 = context.sql(
        "SELECT title, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY title"
    )
    task_10_1.show()

    task_10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_1.csv")

    # 2. Percentage of Comments that Were Positive/Negative Across ALL Days
    task_10_2 = context.sql(
        "SELECT FROM_UNIXTIME(created_utc, 'Y-M-d') AS day, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY day ORDER BY day asc"
    )
    task_10_2.show()

    task_10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_2.csv")

    # 3. Percentage of Comments that Were Positive/Negative Across ALL States
    context.registerFunction("check_state_udf", check_state, BooleanType())
    task_10_3 = context.sql(
        "SELECT author_flair_text AS state, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result WHERE check_state_udf(author_flair_text) = True GROUP BY state"
    )
    task_10_3.show()

    task_10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_3.csv")

    # 4A. Percentage of Comments that Were Positive/Negative Across ALL Comments
    task_10_4A = context.sql(
        "SELECT c_score AS comment_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY comment_score"
    )
    task_10_4A.show()

    task_10_4A.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4A.csv")

    # 4B. Percentage of Comments that Were Positive/Negative Across ALL Story Scores
    task_10_4B = context.sql(
        "SELECT s_score AS submission_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY submission_score"
    )
    task_10_4B.show()

    task_10_4B.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4B.csv")

    #---------------------------------------------------------------------------
    # Extra Credit (Task 10)
    # 1. Percentage of Comments that Were Positive/Negative For Gilded and Non-Gilded Comments
    task_10_extra_credit = context.sql(
        "SELECT gilded, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY gilded"
    )
    task_10_extra_credit.show()

    task_10_extra_credit.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task_10_extra_credit.csv")
Code Example #12
# COMMAND ----------

def find_nearest_books(book_id, num):
  key = data_pca.filter(data_pca.book_id == book_id).select("features").collect()[0][0]
  res = model.approxNearestNeighbors(data_pca, key, num).select("book_id").collect()
  for r in res:
    print(get_book_title(r[0]))

find_nearest_books(100001, 10)

# COMMAND ----------

# DBTITLE 1,Latent Dirichlet allocation
from pyspark.ml.clustering import LDA

vectorizer = CountVectorizer(inputCol="filtered", outputCol="features")
cv = vectorizer.fit(filteredData)
featurizedData = cv.transform(filteredData)

lda = LDA(k=20, maxIter=10)
model = lda.fit(featurizedData)

topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show()

transformed = model.transform(featurizedData)
transformed.show()
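
# A small follow-up sketch (assumption; not in the original cell): show each topic's
# top terms as words rather than vocabulary indices.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

vocab = cv.vocabulary
term_words = udf(lambda idxs: [vocab[i] for i in idxs], ArrayType(StringType()))
topics.withColumn("termWords", term_words("termIndices")) \
      .select("topic", "termWords").show(truncate=False)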

# COMMAND ----------
Code Example #13
spark.createDataFrame(lista_sentimientos, StringType()).show()
#print(dfSpark.show())

# COMMAND ----------

len_udf = udf(lambda s: len(s), IntegerType())
dfSpark = dfSpark.withColumn("token_count", len_udf(col('refined_tokens')))
dfSpark.orderBy(rand()).show(10)

# COMMAND ----------

from pyspark.ml.feature import CountVectorizer

# COMMAND ----------

count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
dfSpark_V = count_vec.fit(dfSpark).transform(dfSpark)
dfSpark_V.select(['refined_tokens', 'token_count', 'features',
                  'Label']).show(10)

# COMMAND ----------

model_df = dfSpark_V.select(['features', 'token_count', 'Label'])

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# COMMAND ----------

df_assembler = VectorAssembler(inputCols=['features', 'token_count'],
Code Example #14
def main(context):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # TASK 1
    # Load the data into PySpark.

    # For the comments:
    if not os.path.exists("./comments.parquet"):
        comments = context.read.json("comments-minimal.json.bz2")
        comments.write.parquet("comments.parquet")

    # For the submissions:
    if not os.path.exists("./submissions.parquet"):
        submissions = context.read.json("submissions.json.bz2")
        submissions.write.parquet("submissions.parquet")
    #submissions.printSchema()

    # For labelled data:
    if not os.path.exists("./labels.parquet"):
        labels = context.read.format('csv').options(
            header='true', inferSchema='true').load("labeled_data.csv")
        labels.write.parquet("labels.parquet")

    # TASK 2
    # Code for Task 2...
    # For task 2, we will join the labels and comments

    commentsParquet = context.read.parquet("comments.parquet")
    commentsParquet.createOrReplaceTempView("comments")

    labelsParquet = context.read.parquet("labels.parquet")
    labelsParquet.createOrReplaceTempView("labels")

    # Now, compute the join:
    if not os.path.exists("./joinedComments.parquet"):
        joinedComments = context.sql(
            "SELECT labels.Input_id, labels.labeldem, labels.labelgop, labels.labeldjt, body FROM comments JOIN labels on id=Input_id"
        )
        joinedComments.write.parquet("joinedComments.parquet")
    joinedComments = context.read.parquet("joinedComments.parquet")
    joinedComments.createOrReplaceTempView("joinedComments")
    #joinedComments.printSchema()

    # TASK 3
    # NOT NEEDED

    # TASK 4
    # Register the user defined function
    context.registerFunction("sanitize", clean_wrapper,
                             ArrayType(StringType()))

    # TASK 5
    if not os.path.exists("./santized.parquet"):
        sanitizedText = context.sql(
            "SELECT Input_id, labeldem, labelgop, labeldjt, sanitize(body) as body FROM joinedComments"
        )
        sanitizedText.write.parquet("sanitized.parquet")

    # TASK 6A
    sanitizedText = context.read.parquet("sanitized.parquet")
    sanitizedText.createOrReplaceTempView("sanitizedText")
    cv = CountVectorizer(inputCol="body",
                         outputCol="features",
                         minDF=10.0,
                         binary=True)
    fitted = cv.fit(sanitizedText)
    vector = fitted.transform(sanitizedText)
    # TASK 6B
    vector.createOrReplaceTempView("vector")
    pos = context.sql("SELECT *, if(labeldjt=1, 1, 0) AS label FROM vector")
    neg = context.sql("SELECT *, if(labeldjt=-1, 1, 0) AS label FROM vector")

    # TASK 7
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # TASK 8 and TASK 9
    # Create the submissions and comments tables from the parquets:
    if not os.path.exists("sanitizedJoinedData.parquet"):
        submissions = context.read.parquet("submissions.parquet")
        submissions.createOrReplaceTempView("submissions")

        comments = context.read.parquet("comments.parquet")
        comments.createOrReplaceTempView("comments")
        comments = comments.sample(False, 0.2, None)
        joinedData = context.sql(
            "SELECT comments.link_id AS id, comments.body, comments.created_utc, submissions.title, comments.author_flair_text, submissions.score AS submission_score, comments.score as comments_score FROM comments JOIN submissions ON REPLACE(comments.link_id, 't3_', '')=submissions.id AND comments.body NOT LIKE '%/s%' AND comments.body NOT LIKE '&gt%'"
        )
        #joinedData.show(joinedData.count(), False)
        #print(str(joinedData.count()))

        # Repeating earlier tasks: Tasks 4 and 5
        joinedData.createOrReplaceTempView("joinedData")
        # Re-register temporary function since we are forced to:
        context.registerFunction("sanitize", clean_wrapper,
                                 ArrayType(StringType()))
        print("writing sanitized parquet now")
        sanitizedJoinedData = context.sql(
            "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, sanitize(body) AS body FROM joinedData"
        )
        sanitizedJoinedData.write.parquet("sanitizedJoinedData.parquet")

    sanitizedJoinedData = context.read.parquet("sanitizedJoinedData.parquet")
    sanitizedJoinedData = sanitizedJoinedData.sample(False, 0.2, None)
    cv = CountVectorizer(inputCol="body",
                         outputCol="features",
                         minDF=10.0,
                         binary=True)
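    # Note: this CountVectorizer is never fit; the model `fitted` trained on the
    # labeled comments above is reused so the unseen data shares the same vocabulary.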
    newVector = fitted.transform(sanitizedJoinedData)

    seenPosModel = CrossValidatorModel.load("project2/pos.model")
    seenNegModel = CrossValidatorModel.load("project2/neg.model")

    posResult = seenPosModel.transform(newVector)
    posResult = posResult.selectExpr("id", "created_utc", "title",
                                     "author_flair_text", "submission_score",
                                     "comments_score", "body", "features",
                                     "probability as positive_probability")

    cumResult = seenNegModel.transform(posResult)
    cumResult = cumResult.selectExpr("id", "created_utc", "title",
                                     "author_flair_text", "submission_score",
                                     "comments_score", "body", "features",
                                     "positive_probability",
                                     "probability as negative_probability")

    cumResult.createOrReplaceTempView("cumResult")

    context.registerFunction("positiveFunc", positiveUDF, IntegerType())
    context.registerFunction("negativeFunc", negativeUDF, IntegerType())
    cumResult = context.sql(
        "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, body, features, positiveFunc(positive_probability) AS positive_probability,negativeFunc(negative_probability) AS negative_probability FROM cumResult"
    )
    cumResult.write.parquet("cumResult.parquet")

    # TASK 10

    cumResult = context.read.parquet("cumResult.parquet")
    cumResult.createOrReplaceTempView("cumResult")
    # Time-series version of the sentiment breakdown, grouped by day (saved as task10_6)

    task10_6 = context.sql(
        "SELECT DATE(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created ORDER BY date_created"
    )
    task10_6.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_6.csv")

    # Top 10 posts:

    if not os.path.exists("./task10_top_pos.csv"):
        task10_top_pos = cumResult.groupBy('title')\
            .agg(
                 (F.sum('positive_probability') / F.count(F.lit(1))).alias('pct_pos'),
                 F.count(F.lit(1)).alias('count')
                 )\
                .orderBy(F.desc('pct_pos'), F.desc('count')).limit(10)\
                .select('title', 'pct_pos')
        task10_top_pos.repartition(
            1).write.format("com.databricks.spark.csv").option(
                "header", "true").save("task10_top_pos.csv")
    if not os.path.exists("./task10_top_neg.csv"):
        task10_top_neg = cumResult.groupBy('title')\
            .agg(
                 (F.sum('negative_probability') / F.count(F.lit(1))).alias('pct_neg'),
                 F.count(F.lit(1)).alias('count')
                 )\
                .orderBy(F.desc('pct_neg'), F.desc('count')).limit(10)\
                .select('title', 'pct_neg')
        task10_top_neg.repartition(
            1).write.format("com.databricks.spark.csv").option(
                "header", "true").save("task10_top_neg.csv")

    # 10.1
    # Get the number of records
    totalRows = cumResult.count()
    # Calculate percentages
    task10_1 = context.sql(
        "SELECT SUM(positive_probability)/ {0} AS pos, SUM(negative_probability)/{1} AS neg FROM cumResult"
        .format(totalRows, totalRows))

    # 10.2
    task10_2 = context.sql(
        "SELECT DAYOFWEEK(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created"
    )

    # 10.3
    context.registerFunction("checkStateWrapper", checkState, BooleanType())
    task10_3 = context.sql(
        "SELECT author_flair_text AS state, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult WHERE(checkStateWrapper(author_flair_text)) GROUP BY author_flair_text"
    )

    # 10.4
    task10_4 = context.sql(
        "SELECT comments_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY comments_score"
    )
    task10_5 = context.sql(
        "SELECT submission_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY submission_score"
    )
    #    cumResult.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("cumResults.csv")
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_1.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_2.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_3.csv")
    task10_4.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_4.csv")
    task10_5.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_5.csv")
Code example #15
0
def train_model():
  '''
  if(dataRdd != None):
    print("**************************************************************************************************** Inside train model with new rdd")
    # Read the model
    pipeModel_Prev = PipelineModel.load('sentiment.model')
    
    # regular expression tokenizer
    regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")

    # bag of words count
    countVectors = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)

    # convert string labels to indexes
    label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")

    nb = NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0, modelType="multinomial")

    # convert prediction to the predictedSentiment
    indexToLabels = IndexToString(inputCol = "prediction", outputCol = "predictedSentiment", labels=["boredom","love","relief", "fun", "hate", "neutral", "anger", "happiness", "surprise","sadness","worry", "empty"])

    # Build the Spark pipeline
    pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx, nb, indexToLabels])

    # Fit the pipeline.
    pipeModel_Next = pipeline.fit(dataRDD)
    pipeModel_New = PipelineModel(stages=[pipeModel_Prev, pipeModel_Next])
    print("Saving the updated sentiment model")
    pipeModel_New.save("sentiment.model")
  '''
  data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('text_emotion.csv')
  #Drop unused columns
  drop_list = ['tweet_id']
  data = data.select([column for column in data.columns if column not in drop_list]) \
             .where(
                    (data['sentiment'] == 'empty') |
                    (data['sentiment'] == 'sadness') |
                    (data['sentiment'] == 'enthusiam') |
                    (data['sentiment'] == 'worry') |
                    (data['sentiment'] == 'surprise') |
                    (data['sentiment'] == 'love') |
                    (data['sentiment'] == 'hate') |
                    (data['sentiment'] == 'anger') |
                    (data['sentiment'] == 'neutral') |
                    (data['sentiment'] == 'relief') |
                    (data['sentiment'] == 'boredom') |
                    (data['sentiment'] == 'fun') |
                    (data['sentiment'] == 'happiness')) \
             .na.drop(thresh=3)

  data.show(5)

  data.groupBy("sentiment") \
      .count() \
      .orderBy(col("count").desc()) \
      .show()

  # set seed for reproducibility
  (trainingData, testData) = data.randomSplit([0.8, 0.2], seed = 100)
  print("Training Dataset Count: " + str(trainingData.count()))
  print("Test Dataset Count: " + str(testData.count()))

  # regular expression tokenizer
  regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")

  # bag of words count
  countVectors = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)

  # convert string labels to indexes
  label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")

  nb = NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0, modelType="multinomial")

  # convert prediction to the predictedSentiment
  indexToLabels = IndexToString(inputCol = "prediction", outputCol = "predictedSentiment", labels=["boredom","love","relief", "fun", "hate", "neutral", "anger", "happiness", "surprise","sadness","worry", "empty"])

  # Build the Spark pipeline
  pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx, nb, indexToLabels])

  # Fit the pipeline on the training data.
  pipelineFit = pipeline.fit(trainingData)
  predictions = pipelineFit.transform(testData)

  # Show the 10 highest-probability rows for each of the 12 predicted classes
  for predicted_class in range(12):
    predictions.filter(predictions['prediction'] == predicted_class) \
        .select("content", "sentiment", "predictedSentiment", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)


  # Retrieve the F1 score (MulticlassClassificationEvaluator defaults to the f1 metric)
  evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label")
  print("F1: %g" % (evaluator.evaluate(predictions)))
  pipelineFit.save("sentiment.model")
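
# A minimal usage sketch (not part of the original file): load the pipeline saved by
# train_model() and score new text. Assumes the same sqlContext used above is available.
def predict_sentiment(texts):
  from pyspark.ml import PipelineModel
  model = PipelineModel.load("sentiment.model")
  # one row per input string, named "content" to match the pipeline's tokenizer input
  df = sqlContext.createDataFrame([(t,) for t in texts], ["content"])
  return model.transform(df).select("content", "predictedSentiment")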
title_category = news_data.select('TITLE', 'CATEGORY')

# Number of distinct categories, top-20 news categories and top-20 news titles
title_category.select('CATEGORY').distinct().count()
title_category.groupBy('CATEGORY').count().orderBy(col('count').desc()).show(truncate=False)
title_category.groupBy('TITLE').count().orderBy(col('count').desc()).show(truncate=False)

# Strip digits from the titles (regexp_replace does regular-expression replacement)
title_category = title_category.withColumn('only_str', regexp_replace(col('TITLE'), r'\d+', ''))
title_category.select('TITLE', 'only_str').show(truncate=False)

# Tokenize the cleaned titles into words
regex_tokenizer = RegexTokenizer(inputCol='only_str', outputCol='words', pattern='\\W')
raw_words = regex_tokenizer.transform(title_category)
raw_words.show()


remover = StopWordsRemover(inputCol = 'words', outputCol = 'filtered')
word_df = remover.transform(raw_words)
word_df.select('words', 'filtered').show(truncate = False)
indexer = StringIndexer(inputCol = 'CATEGORY', outputCol = 'categoryIndex')
feature_data = indexer.fit(word_df).transform(word_df)
feature_data.show()


cv = CountVectorizer(inputCol='filtered', outputCol='features')
# Fit the vectorizer on the filtered words and produce the bag-of-words feature vectors
cv_model = cv.fit(feature_data)
countVectorized = cv_model.transform(feature_data)
countVectorized.show()
Code example #17
0
def compute(sc, topLeft, bottomRight, step, datasetPath, k, gfs):
    sqlContext = SQLContext(sc)
    data = sc.textFile(datasetPath)
    data = data.mapPartitions(lambda x: csv.reader(x))
    header = data.first()
    data = data.filter(lambda x: x != header)
    result_to_write = []
    res_computation = []
    step = check_step(topLeft, bottomRight, step)
    squares = get_squares(topLeft, bottomRight, step)
    # start computing elapsed time here
    start_time = time.time()
    data = data.map(lambda x: is_inside(x, topLeft, bottomRight, step, squares)). \
        filter(lambda x: x is not None)
    data = data.map(remove_punctuation). \
        map(split_string_into_array). \
        filter(remove_empty_array). \
        map(create_row). \
        groupByKey(). \
        map(lambda x : (x[0], list(x[1])))
    # create the dataframes
    allDf = []
    for df in data.collect():
        if df:
            allDf.append([df[0], sqlContext.createDataFrame(df[1])])

    for docDFs in allDf:
        docDF = docDFs[1]
        squareId = docDFs[0]
        # Remove English, Italian and German stop words in turn; the loaded lists must be
        # passed explicitly, otherwise every remover falls back to the English defaults.
        eng_stop = StopWordsRemover.loadDefaultStopWords('english')
        newDocDF_eng = StopWordsRemover(inputCol="words", outputCol="filtered_eng",
                                        stopWords=eng_stop).transform(docDF)
        newDocDF_eng = newDocDF_eng.drop('words')
        ita_stop = StopWordsRemover.loadDefaultStopWords('italian')
        newDocDF_ita = StopWordsRemover(inputCol="filtered_eng", outputCol="filtered_ita",
                                        stopWords=ita_stop).transform(newDocDF_eng)
        newDocDF_ita = newDocDF_ita.drop('filtered_eng')
        ger_stop = StopWordsRemover.loadDefaultStopWords('german')
        newDocDF_ger = StopWordsRemover(inputCol="filtered_ita", outputCol="filtered_ger",
                                        stopWords=ger_stop).transform(newDocDF_ita)
        newDocDF_ger = newDocDF_ger.drop('filtered_ita')

        model = CountVectorizer(inputCol="filtered_ger", outputCol="vectors"). \
            fit(newDocDF_ger)
        result = model.transform(newDocDF_ger)
        corpus = result.select("idd", "vectors").rdd.map(create_corpus).cache()
        # cluster the documents into the k topics using LDA
        ldaModel = LDA.train(corpus,
                             k=k,
                             maxIterations=100,
                             optimizer='online')
        vocabArray = model.vocabulary
        wordNumbers = 10  # number of words per topic
        topicIndices = sc.parallelize(
            ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

        toBePrinted = min(len(vocabArray), wordNumbers)
        topics_final = topicIndices.map(
            lambda x: topic_render(x, toBePrinted, vocabArray)).collect()
        # compute labels
        topics_label = []
        for topic in topics_final:
            for topic_term in topic:
                if topic_term not in topics_label:
                    topics_label.append(topic_term)
                    break
        # print topics
        s = "; "
        res = "{}, {}, {}, {}, {}".format(topLeft.x, topLeft.y, bottomRight.x,
                                          bottomRight.y, s.join(topics_label))
        result_to_write.append(res)
        res_computation.append(topics_label)

    end_time = time.time()
    elapsed_time = end_time - start_time
    result_to_write.append(elapsed_time)
    to_write = sc.parallelize(result_to_write)
    # get dataset size from file name
    size = datasetPath.split('.')[0].split('_')[1]
    if gfs:
        output_folder = "/tmp/Topic_Zoomer_" + str(
            time.ctime(start_time)).replace(' ', '_').replace(':',
                                                              '-') + '_' + size
    else:
        output_folder = "Topic_Zoomer_" + str(time.ctime(start_time)).replace(
            ' ', '_').replace(':', '-') + '_' + size
    to_write.saveAsTextFile(output_folder)

    if gfs:
        copyHdfsCmd = 'hdfs dfs -copyToLocal {} {}'.format(
            output_folder, output_folder)
        copyBucketCmd = 'gsutil cp -r {} {}'.format(output_folder,
                                                    gfs_output_path_hdfs)
        copyRecBucketCmd = 'gsutil cp -r {} {}'.format(recFileFolder,
                                                       gfs_output_path_hdfs)
        copyHdfsRes = subprocess.call(shlex.split(copyHdfsCmd))
        copyBucketRes = subprocess.call(shlex.split(copyBucketCmd))
        copyRecBucketRes = subprocess.call(shlex.split(copyRecBucketCmd))
        # some exit code checks
        if copyBucketRes or copyHdfsRes or copyRecBucketRes:
            print('hdfsRes: {}'.format(copyHdfsRes))
            print('bucketResComp: {}'.format(copyBucketRes))
            print('bucketResRec: {}'.format(copyRecBucketRes))
            print('Something went wrong while copying results')
    return res_computation
Code example #18
0
File: spark.py Project: JoooostB/hva-data-scientist
    def __init__(self):
        # Load the hotel reviews CSV into a PySpark dataframe.
        df = sqlContext.read.format("csv").option("header", "true").load("hotel-reviews.csv")
        # df = sqlContext.createDataFrame(pandas_df)

        # Convert Reviewer_Score into a binary Sentiment label (1 if score >= 7.0, otherwise 0)
        df = df.withColumn('Reviewer_Score', fn.when(df.Reviewer_Score >= 7.0, 1).otherwise(0))
        df = df.withColumnRenamed('Reviewer_Score', 'Sentiment')

        # Concatenate the negative and positive to a single review text
        df_with_text = df.withColumn('Review_Text',
                                     fn.concat(fn.col('Negative_Review'), fn.lit(' '), fn.col('Positive_Review')))

        # Strip Dataframe to only what is necessary for sentiment analysis
        df_stripped = df_with_text.select('Negative_Review', 'Positive_Review', 'Review_Text', 'Sentiment')

        # Import a stop-word list to filter out of the reviews
        stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()

        # Configure tokenizer to extract words with only letters and save in column words
        tokenizer = RegexTokenizer().setGaps(False) \
            .setPattern("\\p{L}+") \
            .setInputCol("Review_Text") \
            .setOutputCol("words")

        # Configure stopwords filter
        sw_filter = StopWordsRemover() \
            .setStopWords(stop_words) \
            .setCaseSensitive(False) \
            .setInputCol("words") \
            .setOutputCol("filtered")

        cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2 ** 17) \
            .setInputCol("filtered") \
            .setOutputCol("tf")

        # Create Pipeline with Tokenizer, Stopwords Filter and CountVectorizer
        cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(df_stripped)

        # Configure TFIDF
        idf = IDF(). \
            setInputCol('tf'). \
            setOutputCol('tfidf')

        idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(df_stripped)

        # Split data into training, validation and testing data (60%, 30%, 10%)
        training_df, validation_df, testing_df = df_stripped.randomSplit([0.6, 0.3, 0.1], seed=0)

        # Configure LogisticRegression for analysis of the reviews
        lr = LogisticRegression(). \
            setLabelCol('Sentiment'). \
            setFeaturesCol('tfidf'). \
            setRegParam(0.0). \
            setMaxIter(100). \
            setElasticNetParam(0.)

        # Create new Pipelines for the LogisticRegression and train the model
        self.model = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)

        # Calculate the accuracy of the model on the validation dataframe
        self.model.transform(validation_df). \
            select(fn.expr('float(prediction = Sentiment)').alias('correct')). \
            select(fn.avg('correct')).show()

        spark = SparkSession \
            .builder \
            .appName("user_input_analysis") \
            .getOrCreate()
Code example #19
0
    # alltags=tags_users.map(lambda x:Counter(x.tags)).reduce(lambda a,b:a+b)
    # print(alltags.most_common(10))
    #.filter(lambda x:len(x.tags)>100) # filtering to get smaller dataset

    # print(tags_users.count())
    # print(tags_users.first())

    ## Filtered for testing

    tags_users_df = sqlContext.createDataFrame(tags_users)
    print(tags_users_df.take(2))
    #
    #
    # print('Indexing strings')
    cVec = CountVectorizer(inputCol='tags',
                           outputCol="tag_features",
                           minDF=10.)
    model = cVec.fit(tags_users_df)
    td = model.transform(tags_users_df)

    with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl',
              mode='wb') as ff:
        pkl.dump(model.vocabulary, ff)

    normalizer = Normalizer(p=1.,
                            inputCol='tag_features',
                            outputCol='tags_normalized')
    tdNorm = normalizer.transform(td)
    print(tdNorm.take(5))

    tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet')
Code example #20
0
def get_trending_news(rdd):
    if not rdd.isEmpty():
        spark = getSparkSessionInstance(rdd.context.getConf())

        df = spark.createDataFrame(rdd)

        # Append the title and summary together
        df_news_concat = df.withColumn("news_content",
                                       fn.concat_ws(" ", df.title, df.summary))

        df_punc_removed = df_news_concat.withColumn(
            "news_content_removed",
            fn.regexp_replace(df_news_concat.news_content, "\p{Punct}", ""))

        udf_remove_unicode = fn.udf(
            lambda x: x.encode("ascii", "ignore").decode("ascii"))
        df_news_content_ascii = df_punc_removed.withColumn(
            "news_content_ascii",
            udf_remove_unicode(df_punc_removed.news_content_removed))

        # insert raw data to the cassandra table
        df_news_content_ascii.select("id", "news_provider", "published", "summary", "title") \
            .write \
            .format("org.apache.spark.sql.cassandra") \
            .mode("append") \
            .options(table="travel_news_data", keyspace="news_stream_analysis") \
            .save(mode="append")

        tokenizer = Tokenizer(inputCol="news_content_ascii",
                              outputCol="content_words")
        df_tokenized_content = tokenizer.transform(df_news_content_ascii).drop(
            "news_content")

        remover = StopWordsRemover(inputCol="content_words",
                                   outputCol="filtered_words")
        stop_words = remover.loadDefaultStopWords("english")
        stop_words.extend([
            '', "travel", "trip", "submitted", "abc", "reditt", "by", "time",
            "timing", "comments", "comment", "thank", "link", "im", "thanks",
            "would", "like", "get", "good", "go", "may", "also", "going",
            "dont", "want", "see", "take", "looking", ""
        ])
        remover.setStopWords(stop_words)
        df_stop_words_removed = remover.transform(df_tokenized_content).drop(
            "content_words")

        cv = CountVectorizer(inputCol="filtered_words",
                             outputCol="rawFeatures")
        cv_model = cv.fit(df_stop_words_removed)
        df_tf_data = cv_model.transform(df_stop_words_removed)
        df_features = df_tf_data.select(
            df_tf_data.rawFeatures.alias("features"))

        def convert_term_indices_to_term(term_indices, vocab):
            terms = []
            for t in term_indices:
                terms.append(vocab[t])

            return str(terms)

        # LDA
        lda = LDA(k=5, maxIter=50, learningOffset=8192.0, learningDecay=0.50)
        model = lda.fit(df_features)
        df_topics = model.describeTopics()

        fn_term_indices_to_term = fn.udf(convert_term_indices_to_term)
        vocab_lit = fn.array(*[fn.lit(k) for k in cv_model.vocabulary])
        df_lda_result = df_topics.withColumn(
            "terms", fn_term_indices_to_term("termIndices", vocab_lit))
        df_lda_result.select("topic", "termIndices",
                             "terms").show(truncate=False)

        df_lda_result.cache()

        lda_terms = df_lda_result.select("terms").collect()
        lda_terms_list = [str(i.terms) for i in lda_terms]

        # based on model terms choose news stories
        for term_list in lda_terms_list:
            s = []
            topic_words = term_list[1:-1].split(",")
            for term in topic_words:
                term = term.split("'")[1]
                s.append(r"(^|\W)" + str(term) + r"($|\W)")
            rx = '|'.join('(?:{0})'.format(x.strip()) for x in s)
            df_results = df_news_content_ascii.filter(
                df_news_content_ascii['news_content_ascii'].rlike(rx))
            df_results = df_results.withColumn("topic_words",
                                               fn.lit(str(topic_words)[1:-1]))
            df_results = df_results.withColumn("results_date",
                                               fn.lit(datetime.datetime.now()))

            # insert results with the raw data to the cassandra table
            df_results.select("id", "news_provider", "published", "summary", "title", "topic_words", "results_date") \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table="travel_news_data_results", keyspace="news_stream_analysis") \
                .save(mode="append")
Code example #21
0
def train_cv_model(modelDataframe):
    cv = CountVectorizer(inputCol="udf_results", outputCol="features", binary=True, minDF=5.0)
    model = cv.fit(modelDataframe)
    model.write().overwrite().save("models/cvModel")
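    # A possible follow-up (assumed usage, not from the original file): reload the saved
    # vectorizer and apply it to a dataframe with the same "udf_results" column.
    #   from pyspark.ml.feature import CountVectorizerModel
    #   cvModel = CountVectorizerModel.load("models/cvModel")
    #   featurized = cvModel.transform(modelDataframe)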
Code example #22
0
data = data.withColumn('length', length(data['text']))
data.show()

# In[3]:
# Compare the length difference between ham and spam
data.groupby('class').mean().show()

# In[4]:
# Treat TF-IDF features for each text
# TF: Term Frequency
# IDF: Inverse Document Frequency
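# Spark computes idf(t) = log((N + 1) / (df(t) + 1)), where N is the number of documents
# and df(t) is the number of documents containing term t, so very common terms get weights near 0.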
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler

tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label')
final_feature = VectorAssembler(inputCols=['tf_idf', 'length'],outputCol='features')

from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages=[ham_spam_to_num,tokenizer,stopremove,count_vec,idf,final_feature])
clean_data = data_prep_pipe.fit(data).transform(data)

clean_data.show()
clean_data.take(1)
clean_data.take(1)[0][-1]

# In[4*]:
# Select the features column and convert it to a Pandas dataframe
df = clean_data.select('features').toPandas()
Code example #23
0
File: reddit_model.py Project: akshaysmit/CS143-P2B
def main(context):
    """Main function takes a Spark SQL context."""

    # the read is from the parquet file
    comments = sqlContext.read.parquet("comments-minimal.parquet")
    submissions = sqlContext.read.parquet("submissions.parquet")
    
    # only look at columns that are useful
    comments = comments.select("id","created_utc","body","author_flair_text", "link_id", "score").\
        withColumnRenamed("score", "commentscore")
    submissions = submissions.select("id", "title", "score").\
        withColumnRenamed("score", "storyscore")

    #comments.write.parquet("comments-minimal.parquet")
    #submissions.write.parquet("submissions.parquet")

    labeled_data = sqlContext.read.format("csv").options(header='true', inferSchema='true').load('labeled_data.csv')

    #here we do the join on comment id
    joined = comments.join(labeled_data, comments.id == labeled_data.Input_id)

    #sanitize_new ignores processed string given by sanitize
    from cleantext import sanitize
    def sanitize_new(text):
        r = sanitize(text)[1:]
        return r[0].split(" ")+r[1].split(" ")+r[2].split(" ")

    #create the udf, generate new column of n-grams
    sanitize_udf = udf(sanitize_new, ArrayType(StringType()))
    joined = joined.withColumn("ngrams", sanitize_udf(joined.body))

    #6a: construct feature vector based on "ngrams"
    #store the transformed column in "features"
    # CountVectorizer produces sparse vectors by default, so no conversion is needed
    cv = CountVectorizer(inputCol="ngrams", outputCol = "features",minDF=5.0, binary=True)
    cv_model = cv.fit(joined)
    joined = cv_model.transform(joined)

    #6b: construct pos column and neg column
    #for this project, only look at label on Trump
    pos_udf = udf(lambda label: 1 if label == 1 else 0 ,IntegerType())
    neg_udf = udf(lambda label: 1 if label ==-1 else 0 ,IntegerType())
    joined = joined.withColumn("poslabel", pos_udf(joined.labeldjt))
    joined = joined.withColumn("neglabel", neg_udf(joined.labeldjt))
    
    #7: train logistic regression model
    #code adopted from project spec
#     #Initialize two logistic regression models.
#     poslr = LogisticRegression(labelCol="poslabel", featuresCol="features", maxIter=10)
#     neglr = LogisticRegression(labelCol="neglabel", featuresCol="features", maxIter=10)
#     poslr.setThreshold(0.2)
#     neglr.setThreshold(0.25)
#     # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
#     posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
#     negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
#     # There are a few parameters associated with logistic regression. We do not know what they are a priori.
#     # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
#     # We will assume the parameter is 1.0. Grid search takes forever.
#     posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
#     negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
#     # We initialize a 5 fold cross-validation pipeline.
#     posCrossval = CrossValidator(
#         estimator=poslr,
#         evaluator=posEvaluator,
#         estimatorParamMaps=posParamGrid,
#         numFolds=5)
#     negCrossval = CrossValidator(
#         estimator=neglr,
#         evaluator=negEvaluator,
#         estimatorParamMaps=negParamGrid,
#         numFolds=5)
#     # Although crossvalidation creates its own train/test sets for
#     # tuning, we still need a labeled test set, because it is not
#     # accessible from the crossvalidator (argh!)
#     # Split the data 50/50
#     posTrain, posTest = joined.randomSplit([0.5, 0.5])
#     negTrain, negTest = joined.randomSplit([0.5, 0.5])

#     # Train the models
#     print("Training positive classifier...")
#     posModel = posCrossval.fit(posTrain)
#     print("Training negative classifier...")
#     negModel = negCrossval.fit(negTrain)

#     # save the models
#     posModel.save("www/pos.model")
#     negModel.save("www/neg.model")

    #load instead
    posModel = CrossValidatorModel.load("www/pos.model")
    negModel = CrossValidatorModel.load("www/neg.model")
    print("finished loading model")

    #8.2 title of submission of the comment
    comments = comments.withColumn("clean_id", regexp_replace("link_id", r'^t3_', ''))
    comments = comments.join(submissions, comments.clean_id == submissions.id).drop(submissions.id)
    
    #9 
    #filter out comments with "\s" and starts with "&gt"
    comments = comments.filter(~comments.body.rlike(r'^&gt')).\
        filter(~comments.body.rlike(r'\\s'))
    #sample
    comments = comments.sample(False, sampleRate, None)  # pass a fixed seed instead of None to make the sample reproducible
    #redo 4,5,6a 
    comments = comments.withColumn("ngrams", sanitize_udf(comments.body))
    comments = cv_model.transform(comments)
    print("done with transforming the sampled comments")

    #make predictions
    comments = posModel.transform(comments).\
        drop("body", "link_id", "clean_id", "ngrams","rawPrediction", "probability").\
        withColumnRenamed("prediction", "poslabel")
    comments = negModel.transform(comments).drop("features", "rawPrediction", "probability").\
        withColumnRenamed("prediction", "neglabel")

    #10
    #1. compute the percentage of positive, negative comments 
    print("Percentage of positive comments")
    result = comments.select('poslabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("pos-perc.csv")
    print("Percenetage of negative comments")
    result = comments.select('neglabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("neg-perc.csv")

    #2. by date
    comments = comments.withColumn("date", from_unixtime(comments.created_utc, "YYYY-MM-dd"))
    result = comments.groupBy("date").agg({"poslabel" : "mean", "neglabel" : "mean"})
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("time_data.csv")

    #3. by state
    val_state_udf = udf(lambda state: state if state in states else None, StringType())
    comments = comments.withColumn("state", val_state_udf(lower(comments.author_flair_text)))
    comments = comments.filter(comments.state.isNotNull())
    result = comments.groupBy("state").agg({"poslabel" : "mean", "neglabel" : "mean"})
    result.show(truncate=False)
    print(result.count())
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("state_data.csv")
    
    #4a. by comment score
    result = comments.groupBy("commentscore").agg({"poslabel" : "mean", "neglabel" : "mean"})
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("comment_score.csv")
    
    #4b. by story score
    result = comments.groupBy("storyscore").agg({"poslabel" : "mean", "neglabel" : "mean"})
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("story_score.csv")
Code example #24
0
def main(inputs):

    amazon_schema = types.StructType([
        types.StructField('marketplace', types.StringType()),
        types.StructField('customer_id', types.IntegerType()),
        types.StructField('review_id', types.StringType()),
        types.StructField('product_id', types.StringType()),
        types.StructField('product_parent', types.LongType()),
        types.StructField('product_title', types.StringType()),
        types.StructField('product_category', types.StringType()),
        types.StructField('star_rating', types.IntegerType()),
        types.StructField('helpful_votes', types.IntegerType()),
        types.StructField('total_votes', types.IntegerType()),
        types.StructField('vine', types.StringType()),
        types.StructField('verified_purchase', types.StringType()),
        types.StructField('review_headline', types.StringType()),
        types.StructField('review_body', types.StringType()),
        types.StructField('review_date', types.DateType())
    ])

    input_df = spark.read.parquet(inputs)
    input_df = input_df.repartition(96)
    #input_df.show()
    #print("No of rows in input dataset:",inputs," is:",input_df.count())
    StopWords = stopwords.words("english")
    start_time = time.time()

    tokens = input_df.rdd.map(lambda x: x['review_headline'])\
    .filter(lambda x: x is not None)\
    .map( lambda document: document.strip().lower())\
    .map( lambda document: re.split(" ", document))\
    .map( lambda word: [x for x in word if x.isalpha()])\
    .map( lambda word: [x for x in word if len(x) > 3] )\
    .map( lambda word: [x for x in word if x not in StopWords])\
    .zipWithIndex()

    df_txts = spark.createDataFrame(tokens, ["list_of_words", 'index'])

    # TF
    cv = CountVectorizer(inputCol="list_of_words",
                         outputCol="raw_features",
                         vocabSize=5000,
                         minDF=10.0)
    cvmodel = cv.fit(df_txts)
    result_cv = cvmodel.transform(df_txts)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv)

    #result_tfidf.show()

    num_topics = 10
    max_iterations = 100
    lda = LDA(k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(result_tfidf.select('index', 'features'))

    wordNumbers = 5
    #topicIndices = sc.parallelize(lda_model.describeTopics(maxTermsPerTopic = wordNumbers))

    topics = lda_model.describeTopics(maxTermsPerTopic=wordNumbers)
    topics.show(truncate=False)
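    # A small follow-up sketch (not in the original snippet): map each topic's term
    # indices back to words via the CountVectorizer vocabulary for readability.
    vocab = cvmodel.vocabulary
    for row in topics.collect():
        print(row['topic'], [vocab[i] for i in row['termIndices']])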
Code example #25
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--docs_path', default='data/wiki-sample/AA')
    parser.add_argument('-p', '--prepro_path', default='data/prepro')
    parser.add_argument('-q',
                        '--queries_path',
                        default='data/queries/sample.json')
    parser.add_argument('-o', '--output_path', default='data/output')
    parser.add_argument('-m',
                        '--mode',
                        choices=['prepro', 'fit', 'query'],
                        default='prepro')
    parser.add_argument('-dl', '--docs_limit', type=int)
    parser.add_argument('-ql', '--queries_limit', type=int)
    parser.add_argument('-il',
                        '--inverted_index_limit',
                        type=int,
                        default=5000)
    args = parser.parse_args()
    print('Running BigramPipeline with args: {}'.format(args))

    spark = SparkSession.builder.appName('BigramModel').getOrCreate()

    tokenIdsUdf = udf(lambda x: x.indices.tolist(), ArrayType(IntegerType()))
    tfIdfModelPath = os.path.join(args.prepro_path, 'tf_idf_model')
    docsTfIdfPath = os.path.join(args.prepro_path, 'docs_tf_idf')
    docsTokenIdsPath = os.path.join(args.prepro_path, 'docs_token_ids')
    docsBigramsPath = os.path.join(args.prepro_path, 'docs_bigrams')

    parser = WikiParser(inputCol='text',
                        outputCol='text_parsed',
                        minParagraphs=1,
                        minCharacters=500)
    tokenizer = Tokenizer(inputCol='text_parsed', outputCol='unigrams')
    ngrams = NGram(inputCol='unigrams', outputCol='bigrams', n=2)
    concat = Concat(inputCols=['unigrams', 'bigrams'], outputCol='tokens')

    if args.mode == 'prepro':
        spark.sparkContext.setJobGroup('input', 'Read input data')
        docs = spark.read.json(args.docs_path)
        if args.docs_limit is not None:
            docs = docs.limit(args.docs_limit)

        spark.sparkContext.setJobGroup('parse_docs', 'Parse wiki documents')
        docsParsed = parser.transform(docs)
        docsParsed = checkpoint(spark, docsParsed,
                                os.path.join(args.prepro_path, 'docs_parsed'))

        spark.sparkContext.setJobGroup('tokenize', 'Tokenize documents')
        docsTokenized = tokenizer.transform(docsParsed)
        docsTokenized = checkpoint(
            spark, docsTokenized,
            os.path.join(args.prepro_path, 'docs_tokenized'))

        spark.sparkContext.setJobGroup('ngrams', 'Compute bigrams')
        docsBigrams = ngrams.transform(docsTokenized)
        docsBigrams = concat.transform(docsBigrams)
        docsBigrams.write.parquet(docsBigramsPath)
    elif args.mode == 'fit':
        spark.sparkContext.setJobGroup('input', 'Read input data')
        docsBigrams = spark.read.parquet(docsBigramsPath).select(
            'id', 'tokens')
        tf = CountVectorizer(inputCol='tokens',
                             outputCol='tf',
                             vocabSize=10000000,
                             minDF=2.0,
                             minTF=3.0)
        idf = IDF(inputCol='tf', outputCol='idf')

        spark.sparkContext.setJobGroup('tf', 'Fit TF model')
        tfModel = tf.fit(docsBigrams)
        docsTf = tfModel.transform(docsBigrams)
        docsTf = checkpoint(spark, docsTf,
                            os.path.join(args.prepro_path, 'docs_tf'))

        spark.sparkContext.setJobGroup('idf', 'Fit IDF model')
        idfModel = idf.fit(docsTf)
        docsTfIdf = idfModel.transform(docsTf)
        docsTfIdf = docsTfIdf.select(docsTfIdf.id.alias('doc_id'),
                                     docsTfIdf.idf.alias('doc_idf'))
        docsTfIdf = checkpoint(spark, docsTfIdf, docsTfIdfPath)
        tfIdfModel = PipelineModel(
            stages=[tokenizer, ngrams, concat, tfModel, idfModel])
        tfIdfModel.save(tfIdfModelPath)

        spark.sparkContext.setJobGroup('docs_token_ids',
                                       'Compute inverted index')
        docsTokenIds = docsTfIdf.select(
            docsTfIdf.doc_id,
            explode(tokenIdsUdf(docsTfIdf.doc_idf)).alias('token_id'))
        docsTokenIds.write.parquet(docsTokenIdsPath)
    elif args.mode == 'query':
        assert args.queries_path is not None

        spark.sparkContext.setJobGroup('input', 'Read input data')
        tfIdfModel = PipelineModel.load(tfIdfModelPath)
        docsTfIdf = spark.read.parquet(docsTfIdfPath)
        docsTokenIds = spark.read.parquet(docsTokenIdsPath)
        queries = spark.read.json(args.queries_path)
        if args.queries_limit is not None:
            queries = queries.limit(args.queries_limit)
        queries = queries.select(queries._id.alias('query_id'),
                                 queries.question.alias('text_parsed'))

        spark.sparkContext.setJobGroup('queries_tf_idf',
                                       'Apply TF-IDF to queries')
        queriesTfIdf = tfIdfModel.transform(queries)
        queriesTfIdf = queriesTfIdf.select(queriesTfIdf.query_id,
                                           queriesTfIdf.tf.alias('query_tf'))
        queriesTfIdf = checkpoint(
            spark, queriesTfIdf,
            os.path.join(args.output_path, 'queries_tf_idf'))
        print('Finished query TF IDF')

        spark.sparkContext.setJobGroup('queries_token_ids',
                                       'Compute query token IDs')
        queriesTokenIds = queriesTfIdf.select(
            queriesTfIdf.query_id,
            explode(tokenIdsUdf(queriesTfIdf.query_tf)).alias('token_id'))
        queriesTokenIds = checkpoint(
            spark, queriesTokenIds,
            os.path.join(args.output_path, 'queries_token_ids'))
        print('Finished query token IDs')

        spark.sparkContext.setJobGroup('doc_queries',
                                       'Perform inverted index filtering')
        docQueries = docsTokenIds.join(queriesTokenIds, on='token_id').groupby(
            'query_id', 'doc_id').count()
        window = Window.partitionBy(docQueries.query_id).orderBy(
            col('count').desc())
        docQueries = docQueries.withColumn('rank', row_number().over(window)) \
                        .filter(col('rank') <= args.inverted_index_limit) \
                        .select('query_id', 'doc_id')
        docQueries = checkpoint(spark, docQueries,
                                os.path.join(args.output_path, 'doc_queries'))
        print('Finished inverted index filter')

        spark.sparkContext.setJobGroup('score', 'Perform scoring')
        docQueries = docQueries.join(docsTfIdf, on='doc_id').join(queriesTfIdf, on='query_id') \
                        .select('query_id', 'doc_id', 'query_tf', 'doc_idf')
        docQueries = Dot(inputCols=['doc_idf', 'query_tf'],
                         outputCol='score').transform(docQueries)
        queryResults = docQueries.select('query_id', 'doc_id', 'score')
        queryResults.write.parquet(
            os.path.join(args.output_path, 'query_results'))
        print('Wrote output to {}'.format(args.output_path))

    spark.stop()
Code example #26
0
# paths to the training and test data (in the same directory as the code)
pathTrain = ["dataset_train.csv"]
pathTest = ["dataset_test.csv"]

# prepare the training and test dataframes
data_treino = spark.read.load(pathTrain, format="csv",
                              header=True)  # training dataset
data_test = spark.read.load(pathTest, format="csv", header=True)  # test dataset
print("\tTraining data")
data_treino.select("*").show()

# stop-word removal, tokenization, IDF and vocabulary construction
tk = Tokenizer(inputCol="Conteudo", outputCol="tokens")
swr = StopWordsRemover(inputCol="tokens", outputCol="words")
cv = CountVectorizer(inputCol="words",
                     outputCol="rawFeatures",
                     vocabSize=100000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# pipeline of the stages above, applied to the training and test data
pipeline = Pipeline(stages=[tk, swr, cv, idf])
model_pipe = pipeline.fit(data_treino)
data_treino = model_pipe.transform(data_treino)

# reuse the pipeline fitted on the training data so both datasets share the same vocabulary and IDF
data_test = model_pipe.transform(data_test)

# build the MinHash LSH model and apply it to the training data
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(data_treino)
data_treino = model.transform(data_treino)
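
# A possible next step (assumed, not part of the original snippet): compare test documents
# against the training set with an approximate Jaccard-distance similarity join.
# The 0.6 distance threshold is an illustrative value.
data_test = model.transform(data_test)
similar_pairs = model.approxSimilarityJoin(data_treino, data_test, 0.6, distCol="JaccardDistance")
similar_pairs.select("datasetA.Conteudo", "datasetB.Conteudo", "JaccardDistance").show(truncate=False)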
Code example #27
0
  .select(F.col('date').cast('date'), 'note', F.col('duration').cast('int'))
maintenance.show(5, truncate=False)

# ### Sample of 2-word nGrams on Maintenance Notes
tk = Tokenizer(inputCol="note", outputCol="words") # Tokenize
maintTokenized = tk.transform(maintenance)
swr = StopWordsRemover(inputCol="words", outputCol="filtered") # Remove stop-words
maintFiltered = swr.transform(maintTokenized)
ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams") # 2-word nGrams
maintNGrams = ngram.transform(maintFiltered)
maintNGrams.select('ngrams').show(5, truncate=False)

# ### Topic Clustering using Latent Dirichlet Allocation (LDA)
# LDA is a form of unsupervised machine learning that identifies clusters, or topics,
# in the data
cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=50)\
  .fit(maintNGrams) # CountVectorize converts nGram array into a vector of counts
maintVectors = cv.transform(maintNGrams)
vocabArray = cv.vocabulary
lda = LDA(k=3, maxIter=10)
ldaModel = lda.fit(maintVectors)

ldaModel.write().overwrite().save('lda.mdl')

topics = ldaModel.describeTopics(5)
# We see below that each maintenance log can be clustered based on its text into 
# 1 of 3 topics below. The nGrams in each cluster show clearly 3 types of maintenance
# activities
# 1. Preventive maintenance occurs when we have 'abnormal readings' or a 'component replacement'
# 2. Corrective maintenance occurs when we have a 'asset shutdown' event or 'asset failure'
# 3. The rest of the logs indicate that no downtime is required (ie. 'maintenance tests passed', 'asset healthy')
for topic in topics.collect():
    # print the top nGrams for each discovered topic
    print(topic['topic'], [vocabArray[i] for i in topic['termIndices']])
Code example #28
0
# limitations under the License.
#

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession.builder.appName(
        "CountVectorizerExample").getOrCreate()

    # $example on$
    # Input data: Each row is a bag of words with a ID.
    df = spark.createDataFrame([(0, "a b c".split(" ")),
                                (1, "a b b c a".split(" "))], ["id", "words"])

    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         vocabSize=3,
                         minDF=2.0)
    model = cv.fit(df)
    result = model.transform(df)
    result.show()
    # $example off$

    spark.stop()
Code example #29
0
    param: Takes in 
    '''
    text = record[3]  # column index 3 holds the review text
    words = text.split()
    return words


udf_formattext = udf(cleanup_text_format, ArrayType(StringType()))
clean_text = reviews.withColumn(
    "reviewTextArray",
    udf_formattext(struct([reviews[x] for x in reviews.columns])))

# CountVectorizer converts a collection of text documents to vectors of token counts
cv = CountVectorizer(inputCol="reviewTextArray",
                     outputCol="rawFeatures",
                     vocabSize=1000)
cvmodel = cv.fit(clean_text)
featurizedData = cvmodel.transform(clean_text)

vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab)


def map_termID_to_Word(termIndices):
    ''' Map each term index back to its original word
    param (list of int): each element corresponds to a term index in the CountVectorizer vocabulary
    returns (list of str): the words corresponding to those term indices
    '''
    return [vocab_broadcast.value[termID] for termID in termIndices]