# ## Load and prep data
#
# * Load the full data set
# * Load the picked k=100 approx Nearest Neighbor results
# * Build song recommendations based on songs in nearest playlist

# In[3]:

mpd_all = mpd.load(spark, "onebig", 1)

# Get the ranked popularity of songs in the mpd.

# In[32]:

# Vocabulary is capped at 2,000,000 terms; minDF=2 drops track URIs that
# occur in fewer than two playlists.
cv = CountVectorizer(
    inputCol="track_uri",
    outputCol="features",
    minDF=2,
    vocabSize=2000000,
)

# In[33]:

# The same (pid, track_uri) projection feeds both fit and transform.
pid_track_uris = mpd_all.select("pid", "tracks.track_uri")
model = cv.fit(pid_track_uris)

# In[35]:

result = model.transform(pid_track_uris)

# In[36]:

#model, result = mpd.vectorizecol(mpd_all.select("pid", "tracks.track_uri"), "track_uri", "features", 2000000)

# In[37]:
This portion of the code creates topics and associated words using Latent Dirichlet Allocation @author: [email protected] """ from pyspark.sql import SQLContext, Row from pyspark.ml.feature import CountVectorizer from pyspark.mllib.clustering import LDA, LDAModel sqlContext = SQLContext(sc) path = "./advisorconversations/advsisortext.txt" data = sc.textFile(path).zipWithIndex().map( lambda (words, idd): Row(idd=idd, words=words.split(" "))) docDF = sqlContext.createDataFrame(data) Vector = CountVectorizer(inputCol="words", outputCol="vectors") model = Vector.fit(docDF) result = model.transform(docDF) corpus_size = result.count() # total number of words corpus = result.select("idd", "vectors").map(lambda (x, y): [x, y]).cache() # Cluster the documents into four topics using LDA ldaModel = LDA.train(corpus, k=4, maxIterations=100, optimizer='online') topics = ldaModel.topicsMatrix() vocabArray = model.vocabulary wordNumbers = 50 # number of words per topic topicIndices = sc.parallelize( ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
##################################################
################## Transformers ##################
##################################################

# Regular-expression tokenizer over the "Descript" column, using "\W" as
# the pattern.
regexTokenizer = RegexTokenizer(
    inputCol="Descript",
    outputCol="words",
    pattern="\\W",
)

# Stop words handed to the remover (this list replaces whatever list the
# remover would otherwise use).
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
stopwordsRemover = stopwordsRemover.setStopWords(add_stopwords)

# Bag-of-words counts: at most 10000 vocabulary terms, each present in at
# least 5 documents.
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

# Encode the "Category" strings as a numeric "label" column.
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")

transformers = [
    regexTokenizer,
    stopwordsRemover,
    countVectors,
    label_stringIdx,
]

# Fit every stage on the full data, then apply them to obtain the dataset.
pipeline = Pipeline(stages=transformers)
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

### Randomly split data into training and test sets; the seed is fixed for
### reproducibility.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: {}".format(trainingData.count()))
print("Test Dataset Count: {}".format(testData.count()))
testData.show(5)
# NOTE(review): the original began with a stray, incomplete expression
# `sc.brao` (presumably an unfinished `sc.broadcast(...)` call).  Evaluating
# it would raise AttributeError, so it has been dropped; reinstate a proper
# broadcast here if that was the intent.

# Membership lookup built once, so each word test inside the UDF is O(1).
# Assumes `high_freq_words` is an iterable of hashable words - TODO confirm.
_high_freq_lookup = frozenset(high_freq_words)


def remove_low_freq_words(words):
    """Return the entries of ``words`` that occur in ``high_freq_words``.

    The input order and any duplicates are preserved.  A null input array
    yields an empty list instead of raising inside the UDF.
    """
    if words is None:
        return []
    return [word for word in words if word in _high_freq_lookup]


# From here on the name refers to the Spark UDF wrapping the function above.
remove_low_freq_words = udf(remove_low_freq_words, ArrayType(StringType()))

df = df.withColumn('high_freq_words',
                   remove_low_freq_words(col('context_words')))

# Term counts over the retained words; the vocabulary is capped at the
# number of high-frequency words.
cv = CountVectorizer(inputCol="high_freq_words",
                     outputCol="words_features",
                     vocabSize=len(high_freq_words))
model = cv.fit(df)
df = model.transform(df)

# K-means with k=100 and a fixed seed.  `model` is rebound from the
# CountVectorizer model to the KMeans model here.
kmeans = KMeans(featuresCol="words_features",
                predictionCol="kmeans_prediction").setK(100).setSeed(1)
model = kmeans.fit(df)
predictions = model.transform(df)
predictions.select(['context_words', 'high_freq_words',
                    'kmeans_prediction']).show(100, truncate=False)

#################################################################
# Group by thread and compute statistics
def _model_rating_group(lda, reviews, vocab_word, output_folder, rating,
                        topics_heading):
    """Fit LDA on one rating group, print samples and write its CSV outputs.

    Args:
        lda: configured (unfitted) LDA estimator.
        reviews: DataFrame holding the feature column the estimator reads,
            plus ``BusinessID`` and ``BusinessName``.
        vocab_word: UDF mapping a ``termIndices`` column to vocabulary words.
        output_folder: folder prefix for the two CSV outputs.
        rating: group tag (``'low'`` or ``'high'``) used in the printed
            message and in the output folder names.
        topics_heading: line printed above the topic table.
    """
    topic_model = lda.fit(reviews)
    transformed = topic_model.transform(reviews)
    print("topic distribution for " + rating + " rating business")
    transformed.select('BusinessID', 'BusinessName',
                       'topicDistribution').show(4, False)

    # Per-business topic distribution, cast to a string column before the
    # CSV write.
    dist = transformed.withColumn(
        'topic_distribution',
        transformed['topicDistribution'].cast('string'),
    ).drop('topicDistribution')
    dist_df = dist.select('BusinessID', 'BusinessName', 'topic_distribution')
    dist_df.write.csv(
        output_folder + '/Topic_' + rating + '_business_topic_dist',
        header=True)

    # Key topics: translate term indices back into vocabulary words.
    topics = topic_model.describeTopics()
    topics_named = topics.withColumn("topic_word",
                                     vocab_word(topics['termIndices']))
    topic_df = topics_named.select('topic', 'topic_word')
    print(topics_heading)
    topic_df.show(6, False)
    topic_df.coalesce(1).write.csv(
        output_folder + '/Topic_' + rating + '_rating_topic', header=True)


def main(review_table, business_table, output_folder):
    """Run LDA topic modelling on reviews of Toronto businesses.

    Reviews are concatenated per business, joined with the Toronto rows of
    the business table, tokenized, stop-word filtered and turned into
    TF-IDF vectors.  Businesses are then split into low (stars <= 3) and
    high (stars > 3) groups; an LDA model with 6 topics is fitted on each
    group, and the topic distributions and topic words are written as CSV
    under ``output_folder``.

    Args:
        review_table: path of the reviews parquet data.
        business_table: path of the business parquet data.
        output_folder: folder prefix for the CSV outputs.
    """
    # Read reviews and business data
    review_df = spark.read.parquet(review_table)
    review_df.createOrReplaceTempView("reviews_table")
    business_df = spark.read.parquet(business_table)
    business_toronto = business_df.filter(business_df.City == "Toronto")
    business_toronto.createOrReplaceTempView("business_table")

    # Collect reviews for each business
    business_review = spark.sql(
        """
        SELECT BusinessID, collect_set(Review) AS total_review
        FROM reviews_table
        GROUP BY BusinessID
        """
    )

    # Join each business's collected reviews into one space-separated string
    merge_review = udf(lambda total_review: (" ").join(total_review))
    business_concat_review = business_review.withColumn(
        "comb_review", merge_review(business_review['total_review'])
    ).drop(business_review['total_review'])
    business_concat_review.createOrReplaceTempView("comb_review_table")

    # Keep reviews for businesses in Toronto
    Reviews_for_business = spark.sql(
        """
        SELECT c.BusinessID, b.Name AS BusinessName, b.BusinessStars, c.comb_review
        FROM comb_review_table AS c
        INNER JOIN business_table AS b ON c.BusinessID=b.BusinessID
        """
    )

    # Pipeline to preprocess text data.  The pattern is a raw string so the
    # regex escape \w is not treated as an (invalid) Python string escape.
    regexTokenizer = RegexTokenizer(gaps=False, pattern=r'\w+',
                                    inputCol='comb_review', outputCol='token')
    stopWordsRemover = StopWordsRemover(inputCol='token',
                                        outputCol='no_stopword')
    countVectorizer = CountVectorizer(inputCol="no_stopword",
                                      outputCol="rawcol")
    TDF = IDF(inputCol="rawcol", outputCol="idf_vec")
    text_pipeline = Pipeline(
        stages=[regexTokenizer, stopWordsRemover, countVectorizer, TDF])
    IDF_model = text_pipeline.fit(Reviews_for_business)
    #IDF_model.write().overwrite().save('IDF_model1')

    # Vocabulary learned by the count vectorizer (pipeline stage index 2)
    vocab = IDF_model.stages[2].vocabulary
    business_review_df = IDF_model.transform(Reviews_for_business)

    # Two business categories based on low and high star rating
    reviews_low = business_review_df.where(
        business_review_df.BusinessStars <= 3)
    reviews_high = business_review_df.where(
        business_review_df.BusinessStars > 3)

    lda = LDA(k=6, seed=123, optimizer='online', featuresCol="idf_vec")
    # No return type is passed to udf(), so the UDF's default return type
    # applies to the word list.
    vocab_word = udf(lambda termIndices: [vocab[idx] for idx in termIndices])

    # Topic modelling on low rating business
    _model_rating_group(lda, reviews_low, vocab_word, output_folder, 'low',
                        "Topics for low rating business")
    # Topic modelling on high rating business
    _model_rating_group(lda, reviews_high, vocab_word, output_folder, 'high',
                        "\nTopics for high rating business")
# Class balance of the target column.
train.groupby('final_status').count().show()
# Output recorded by the author:
# +------------+-----+
# |final_status|count|
# +------------+-----+
# |           0|69629| #0.679
# |           1|32853| #0.320
# +------------+-----+

# Text columns
# desc,keywords [TEXT]
# Keywords are '-'-separated; split them into an array and count-vectorize.
train_test = train_test.withColumn(
    'keyword_features', split(col('keywords'), '-'))
cv = CountVectorizer(
    inputCol="keyword_features",
    outputCol="keyword_features_cv",
)
model = cv.fit(train_test)
train_test = model.transform(train_test)
train_test.show(truncate=False)

train.columns
#['project_id', 'name', 'desc', 'goal', 'keywords', 'disable_communication_encoded', 'country', 'currency', 'deadline', 'state_changed_at', 'created_at', 'launched_at', 'final_status', 'countryindexed', 'country_features', 'currencyindexed', 'currency_features', 'keyword_features', 'keyword_features_cv', 'diff_statechange_deadline', 'diff_created_deadline', 'diff_launched_deadline', 'diff_statechange_launched']

# Differences between pairs of timestamp columns, divided by 86400
# (presumably epoch seconds converted to days - TODO confirm the units).
# Each entry is (new column, minuend column, subtrahend column).
for _diff_name, _minuend, _subtrahend in (
        ('diff_statechange_deadline', 'state_changed_at', 'deadline'),
        ('diff_created_deadline', 'deadline', 'created_at'),
        ('diff_launched_deadline', 'deadline', 'launched_at'),
        ('diff_statechange_launched', 'state_changed_at', 'launched_at'),
):
    train_test = train_test.withColumn(
        _diff_name,
        (train_test[_minuend] - train_test[_subtrahend]) / 86400)
def main(context):
    """Main function takes a Spark SQL context.

    Loads comments, submissions and labeled data, trains a "positive" and a
    "negative" logistic-regression classifier on the labeled comments,
    applies both to comments joined with their submissions, and writes
    aggregate sentiment statistics as CSV into the working directory.

    Args:
        context: Spark SQL context (or session) used for all reads.
    """

    def save_csv(frame, path):
        # Collapse to one partition so the output is written as a single part.
        frame.repartition(1).write.format(
            "com.databricks.spark.csv").save(path)

    # The same pair of averages is computed for every grouping below.
    sentiment_avgs = [F.avg("pos").alias("pos"), F.avg("neg").alias("neg")]

    # TASK 1: load data
    # Read from original source files
    comments = context.read.json("comments-minimal.json.bz2")
    submissions = context.read.json("submissions.json.bz2")
    labels = context.read.csv("labeled_data.csv", header=True)

    # Optional parquet caching, kept for reference:
    # # Write to parquet files
    # comments.write.parquet("comments.parquet")
    # submissions.write.parquet("submissions.parquet")
    # labels.write.parquet("labels.parquet")
    # # Read from parquet files
    # comments = context.read.parquet("comments.parquet")
    # submissions = context.read.parquet("submissions.parquet")
    # labels = context.read.parquet("labels.parquet")

    comments = comments.select(
        "id", "body", "created_utc", "author_flair_text", "link_id",
        col("score").alias("c_score"))
    submissions = submissions.select(
        "title", "id", col("score").alias("s_score"))
    labels = labels.select("Input_id", "labeldjt")

    # TASK 2: join labeled_data with comments_minimal
    comments_labels = labels.join(
        comments, labels.Input_id == comments.id).select(
            "id", "body", "created_utc", "author_flair_text", "link_id",
            "labeldjt")

    # TASK 4, 5: Generate unigrams, bigrams, and trigrams for each comment in
    # the labeled data, store all of them into one column and split them by
    # words.
    sanitize_udf = udf(sanitize, ArrayType(StringType()))
    split_udf = udf(split_arr_to_word, ArrayType(StringType()))
    sanitized_table = comments_labels.select(
        "id", "labeldjt",
        split_udf(sanitize_udf("body")).alias("sanitized_text"))

    # TASK 6A: Turn raw features into a sparse feature vector.  minDF=10.0
    # keeps only tokens that occur in at least 10 comments.
    cv = CountVectorizer(minDF=10.0, inputCol="sanitized_text",
                         outputCol="vectors")
    cv_table = cv.fit(sanitized_table)
    vec_table = cv_table.transform(sanitized_table)

    # TASK 6B: Add columns for positive and negative labels
    final = vec_table.withColumn(
        "positive", F.when(vec_table.labeldjt == 1, 1).otherwise(0)
    ).withColumn(
        "negative", F.when(vec_table.labeldjt == -1, 1).otherwise(0))
    pos = final.select(col("id"), col("vectors").alias("features"),
                       col("positive").alias("label"))
    neg = final.select(col("id"), col("vectors").alias("features"),
                       col("negative").alias("label"))

    # TASK 7: Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to
    # deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do
    # not know what they are a priori.  We do a grid search to find the best
    # parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid, numFolds=5)
    negCrossval = CrossValidator(estimator=neglr, evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid, numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the
    # models and load them again later:
    # posModel.save("project2/pos.model")
    # negModel.save("project2/neg.model")
    # To load saved models:
    # posModel = CrossValidatorModel.load("project2/pos.model")
    # negModel = CrossValidatorModel.load("project2/neg.model")

    # TASK 8: read more parts of comments
    fix_link_udf = udf(remove_first_three, StringType())
    comments_fixed = comments.select(
        col("id").alias("comment_id"),
        fix_link_udf("link_id").alias("link_id"),
        "created_utc", "body",
        col("author_flair_text").alias("state"),
        "c_score")
    # NOTE(review): randomSplit normalizes its weights, so a single weight of
    # 0.2 yields one split holding ALL joined rows, not a 20% sample.  Left
    # unchanged; use DataFrame.sample if a 20% sample was intended.
    new_table, = submissions.join(
        comments_fixed,
        comments_fixed.link_id == submissions.id).randomSplit([0.2])

    # TASK 9:
    # remove any comments that contain '\s' or '>'
    # NOTE(review): the literal below is backslash + "s" (value unchanged,
    # now spelled with a valid escape).  Confirm whether the "/s" marker was
    # meant instead.
    new_table = new_table.filter(
        ~new_table.body.contains(">") & ~new_table.body.contains("\\s"))

    # Repeat task 4 and 5 and 6A
    sanitized_new_table = new_table.select(
        "link_id", "state", "comment_id", "body", "created_utc", "title",
        split_udf(sanitize_udf("body")).alias("sanitized_text"),
        "s_score", "c_score")
    final_table = cv_table.transform(sanitized_new_table)

    # Run the models
    # `ith` presumably extracts one element of the probability vector -
    # TODO confirm against the definition of ith_.
    ith = udf(ith_, FloatType())
    task9_table = final_table.select(
        "link_id", "state", "comment_id", "body", "created_utc", "title",
        "c_score", "s_score", "sanitized_text",
        col("vectors").alias("features"))
    task9_table = posModel.transform(task9_table)
    # The select drops the positive model's output columns so that the
    # negative model can add its own.
    task9_table = task9_table.withColumn(
        "pos",
        F.when(ith(task9_table.probability, lit(1)) > 0.2, 1).otherwise(0)
    ).select("link_id", "state", "comment_id", "body", "c_score", "s_score",
             "created_utc", "title", "features", "pos")
    task9_table = negModel.transform(task9_table)
    task9_table = task9_table.withColumn(
        "neg",
        F.when(ith(task9_table.probability, lit(1)) > 0.25, 1).otherwise(0)
    ).select("link_id", "state", "comment_id", "body", "c_score", "s_score",
             "created_utc", "title", "pos", "neg")

    # TASK 10: calculate statistics
    # Part 1: overall averages
    part1 = task9_table.groupBy().agg(*sentiment_avgs)
    # Part 2: averages per day
    part2 = task9_table.groupBy(
        from_unixtime("created_utc", "yyyy-MM-dd")).agg(*sentiment_avgs)
    # Part 3: averages per state
    states = [
        'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
        'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
        'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
        'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
        'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
        'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
        'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
        'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
        'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
    ]
    part3 = task9_table.where(col("state").isin(states)).groupBy(
        "state").agg(*sentiment_avgs)
    # Part 4: averages per submission title and per score
    part4 = task9_table.groupBy("title").agg(*sentiment_avgs)
    by_c_score = task9_table.groupBy("c_score").agg(*sentiment_avgs)
    by_s_score = task9_table.groupBy("s_score").agg(*sentiment_avgs)

    # Save the data
    save_csv(part1, "part1.csv")
    save_csv(part2, "part2.csv")
    save_csv(part3, "part3.csv")
    save_csv(by_c_score, "c_score.csv")
    save_csv(by_s_score, "s_score.csv")
    save_csv(part4, "part4.csv")

    # Part 5: calculate percentage of positive and negative comments by month.
    # This is derived from the saved part2 output, so it must be computed
    # after part2.csv has been written and before part5 itself is saved
    # (the original saved part5 before assigning it).
    daily_raw = context.read.csv("part2.csv")
    daily = daily_raw.withColumn(
        "pos", daily_raw["_c1"].cast(FloatType())
    ).withColumn(
        "neg", daily_raw["_c2"].cast(FloatType())
    ).drop("_c1").drop("_c2").na.drop()
    part5 = daily.groupBy(F.month(daily._c0), F.year(daily._c0)).agg(
        *sentiment_avgs)
    save_csv(part5, "part5.csv")

    # for plot 4: top 10 titles by positive and by negative share
    part4.sort(col("pos").desc()).limit(10).repartition(1).write.format(
        "com.databricks.spark.csv").save("plot4_pos.csv")
    part4.sort(col("neg").desc()).limit(10).repartition(1).write.format(
        "com.databricks.spark.csv").save("plot4_neg.csv")
out_vec.append(t_stem) return out_vec # Create user defined function for stemming with return type Array<String> stemmer_udf = udf(lambda x: stem(x), ArrayType(StringType())) # Create new df with vectors containing the stemmed tokens # Create new df with vectors containing the stemmed tokens vector_stemmed_df = ( cleaned .withColumn("vector_stemmed", stemmer_udf("words")) ) # vectorize cv = CountVectorizer(inputCol="vector_stemmed", outputCol="vectors") print 'done' count_vectorizer_model = cv.fit(vector_stemmed_df) print 'done' result = count_vectorizer_model.transform(vector_stemmed_df) corpus = result.select(F.col('id').cast("long"), 'vectors').rdd \ .map(lambda x: [x[0], x[1]]) # Runnign LDA after processing the data lda_model = LDA.train(rdd=corpus, k=5, seed=12, maxIterations=50) # extracting topics topics = lda_model.describeTopics(maxTermsPerTopic=10) # extraction vocabulary vocabulary = count_vectorizer_model.vocabulary
ngram.transform(data_token).select('bigram').show(truncate=False)

# Tfidf
from pyspark.ml.feature import HashingTF, IDF

# NOTE(review): this stage reads a 'tokens' column while the CountVectorizer
# below reads 'token' - confirm which column data_token actually has.
hasing_tf = HashingTF(inputCol='tokens', outputCol='rawfeatures')
data_token = hasing_tf.transform(data_token)
idf = IDF(inputCol='rawfeatures', outputCol='features')
idf_model = idf.fit(data_token)
data_tfidf = idf_model.transform(data_token)
data_tfidf.show()

# CountVectorizer
from pyspark.ml.feature import CountVectorizer

# The original passed a bare positional `vocabSize` after keyword arguments,
# which is a SyntaxError.  It is omitted here so the estimator's default
# vocabulary size applies; pass vocabSize=<n> explicitly to cap it.
cv = CountVectorizer(inputCol='token', outputCol='features', minDF=2)
result = cv.fit(data_token).transform(data_token)
result.show()

############# model fitting
tokenizer = Tokenizer(inputCol='text', outputCol='token')
remover = StopWordsRemover(inputCol='token', outputCol='token_stop')
cv = CountVectorizer(inputCol='token_stop', outputCol='token_cv')
idf = IDF(inputCol='token_cv', outputCol='token_tfidf')
# VectorAssembler combines several input columns, so its parameter is the
# plural `inputCols` (the original used `inputCol`).
assembler = VectorAssembler(inputCols=['token_tfidf', 'length'],
                            outputCol='features')
indexer_y = StringIndexer(inputCol='Survived', outputCol='label')

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes()
def main(sqlContext):
    """Main function takes a Spark SQL context.

    Joins labeled data with comments, trains a positive and a negative
    logistic-regression classifier on n-gram count vectors, applies both to
    comments joined with a 20% sample of submissions, and prints or saves
    aggregate statistics (overall, by date, by state, by score, by locked /
    over_18 status and by title) as CSV in the working directory.

    Args:
        sqlContext: Spark SQL context used for the reads and SQL queries.
    """

    def save_csv(frame, path):
        # Single partition, with a header row.
        frame.repartition(1).write.format("com.databricks.spark.csv").option(
            "header", "true").save(path)

    # YOUR CODE HERE
    comments = sqlContext.read.json("comments-minimal.json.bz2")
    submissions = sqlContext.read.json("submissions.json.bz2")
    label = sqlContext.read.csv('labeled_data.csv', header=True,
                                inferSchema=True)
    # The SQL in task 2 refers to these frames by view name, so they must be
    # registered before the query runs (the only registration calls in the
    # original were commented out below).
    comments.createOrReplaceTempView("comments")
    label.createOrReplaceTempView("label")

    # comments.write.parquet("comments.parquet")
    # submissions.write.parquet("submissions.parquet")
    # label.write.parquet("label.parquet")
    # comments = sqlContext.read.parquet("comments.parquet")
    # comments.createOrReplaceTempView("comments")
    # submissions = sqlContext.read.parquet("submissions.parquet")
    # submissions.createOrReplaceTempView("submissions")
    # label = sqlContext.read.parquet("label.parquet")
    # label.createOrReplaceTempView("label")

    # task 2
    df2 = sqlContext.sql(
        '''SELECT DISTINCT(label.Input_id),comments.*, label.labeldem, label.labelgop, label.labeldjt
        FROM label INNER JOIN comments ON label.Input_id=comments.id
        '''
    )

    # Question 1:
    # Input_id -> (labeldem, labelgop, labeldjt)
    # Question 2:
    # The data frame is not normalized, with redundant data. Given a comment id, we can uniquely identify the body of
    # the comment. Given the author URL, we can find the can_gild status. Given author URL and subreddit_id, we can find
    # author flair text and its css class string. We can decompose it into 4NF after identifying some functional
    # dependencies. We believe the collector of the data stored it in this way because it is easier to update one table
    # at a time than to update multiple ones. Also, a user status might change in future time, so the collector collected
    # snapshot of the comments with the most current information.

    # task 4 and 5
    sanitize_udf = udf(cleantext.sanitize, ArrayType(StringType()))
    df4 = df2.withColumn('sanitized', sanitize_udf('body'))
    concat_udf = udf(concat_string_array, ArrayType(StringType()))
    df5 = df4.withColumn('ngram', concat_udf('sanitized'))
    df5 = df5.drop('sanitized')
    # 1779
    # df5.write.parquet("df5.parquet")

    # task 6A
    # df5 = sqlContext.read.parquet("df5.parquet")
    cv = CountVectorizer(inputCol="ngram", outputCol="features", minDF=6)
    model = cv.fit(df5)
    # model.save('cv.model')
    result = model.transform(df5)
    # result.show(truncate=False)

    # task 6B
    result.createOrReplaceTempView("result")
    df6_positive = sqlContext.sql(
        "SELECT *, IF(labeldjt==1,1,0) AS label FROM result")
    df6_negative = sqlContext.sql(
        "SELECT *, IF(labeldjt==-1,1,0) AS label FROM result")

    # task 7
    # Initialize two logistic regression models, with decision thresholds of
    # 0.2 (positive) and 0.4 (negative).
    poslr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10).setThreshold(0.4)
    # This is a binary classifier so we need an evaluator that knows how to
    # deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do
    # not know what they are a priori.  We do a grid search to find the best
    # parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid, numFolds=5)
    negCrossval = CrossValidator(estimator=neglr, evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid, numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = df6_positive.randomSplit([0.5, 0.5])
    negTrain, negTest = df6_negative.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the
    # models and load them again later.
    # posModel.save("pos.model")
    # negModel.save("neg.model")

    # task 8
    # Join comments with a 20% sample (without replacement) of submissions;
    # link_id.substr(4, 12) skips the first three characters of link_id.
    ss = submissions.sample(False, 0.2, None)
    df8 = comments.join(ss, comments.link_id.substr(4, 12) == ss.id).select(
        comments.created_utc, comments.score.alias('cscore'),
        ss.score.alias('sscore'), ss.title, ss.locked, ss.over_18,
        comments.author_flair_text, comments.id, comments.body)

    # Question 3
    # df8.explain()
    # == Physical Plan ==
    # *(2) Project [created_utc#10L, score#20L AS cscore#1619L, score#92L AS sscore#1620L, title#106, locked#76, over_18#83, author_flair_text#3, id#14, body#4]
    # +- *(2) BroadcastHashJoin [substring(link_id#16, 4, 12)], [id#69], Inner, BuildRight
    #    :- *(2) Project [author_flair_text#3, body#4, created_utc#10L, id#14, link_id#16, score#20L]
    #    :  +- *(2) Filter isnotnull(link_id#16)
    #    :     +- *(2) FileScan parquet [author_flair_text#3,body#4,created_utc#10L,id#14,link_id#16,score#20L] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/media/sf_vm-shared/comments.parquet], PartitionFilters: [], PushedFilters: [IsNotNull(link_id)], ReadSchema: struct<author_flair_text:string,body:string,created_utc:bigint,id:string,link_id:string,score:big...
    #    +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false]))
    #       +- *(1) Filter isnotnull(id#69)
    #          +- *(1) Sample 0.0, 0.2, false, 1565934737914995123
    #             +- *(1) FileScan parquet [id#69,locked#76,over_18#83,score#92L,title#106] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/media/sf_vm-shared/submissions.parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:string,locked:boolean,over_18:boolean,score:bigint,title:string>

    # df8 = comments.join(submissions, comments.link_id.substr(4, 12) == submissions.id)\
    #     .select(comments.created_utc, comments.score.alias('cscore'), submissions.score.alias('sscore'), submissions.title, comments.author_flair_text, comments.id, comments.body)
    # df8.write.parquet("df8.parquet")

    # task 9
    # model = CountVectorizerModel.load('cv.model')
    # posModel = CrossValidatorModel.load("pos.model")
    # negModel = CrossValidatorModel.load("neg.model")
    # Drop comments containing "/s" and comments starting with ">".
    df92 = df8.filter("body NOT LIKE '%/s%'").filter("body NOT LIKE '>%'")
    df94 = df92.withColumn('sanitized', sanitize_udf('body'))
    df95 = df94.withColumn('ngram', concat_udf('sanitized'))
    df95 = df95.drop('sanitized')
    df9 = model.transform(df95)
    # The positive model's output columns are renamed/dropped so that the
    # negative model can add its own afterwards.
    df9pos = posModel.transform(df9).withColumnRenamed(
        'prediction', 'Positive').drop('rawPrediction', 'probability',
                                       'ngram')
    df10 = negModel.transform(df9pos).withColumnRenamed(
        'prediction', 'Negative').drop('rawPrediction', 'probability',
                                       'features')
    # df10.write.parquet("df10.parquet")
    # df9.write.parquet("df9.parquet")
    # posResult.write.parquet("df9pos.parquet")
    # negResult.write.parquet("df9neg.parquet")

    # task 10
    # df9 = sqlContext.read.parquet("df9.parquet")
    # posResult = sqlContext.read.parquet("df9pos.parquet")
    # negResult = sqlContext.read.parquet("df9neg.parquet")
    states = [
        'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
        'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
        'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
        'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
        'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
        'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
        'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
        'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
        'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
    ]

    # Columns shared by every per-date aggregation below.
    date_cols = [
        to_date(df10.created_utc.cast('timestamp')).alias('date'),
        df10.Positive,
        df10.Negative,
    ]

    def daily_average(frame):
        # Average Positive/Negative per calendar date.
        return frame.select(*date_cols).groupBy('date').avg(
            'Positive', 'Negative')

    # 1
    df10.groupBy().avg('Positive', 'Negative').show()
    # +------------------+------------------+
    # |     avg(Positive)|     avg(Negative)|
    # +------------------+------------------+
    # |0.3285865077437555|0.4712543825234617|
    # +------------------+------------------+

    # 2
    byDate = daily_average(df10)
    save_csv(byDate, 'time_data.csv')
    # +----------+-------------------+
    # |      date|    avg(prediction)|
    # +----------+-------------------+
    # |2017-08-11| 0.3432203389830508|
    # |2017-09-11| 0.6041666666666666|
    # |2017-01-06|0.45698166431593795|
    # |2017-02-26| 0.2857142857142857|
    # |2017-01-27| 0.4057971014492754|
    # |2017-09-28|              0.495|
    # |2016-12-19|0.31800766283524906|
    # |2016-11-08| 0.3527644230769231|
    # |2017-01-24| 0.4975514201762977|
    # |2017-06-29| 0.3684210526315789|
    # |2017-09-29| 0.4827586206896552|
    # |2017-07-31| 0.4574898785425101|
    # |2017-02-16| 0.4444444444444444|
    # |2017-08-18| 0.4482758620689655|
    # |2017-12-02|0.37116564417177916|
    # |2017-08-14| 0.3838383838383838|
    # |2017-10-23| 0.4175824175824176|
    # |2017-12-25|0.44907407407407407|
    # |2017-04-09| 0.3409090909090909|
    # |2017-03-28| 0.4363143631436314|
    # +----------+-------------------+

    # 3
    byState = df10[df10.author_flair_text.isin(states)].groupBy(
        'author_flair_text').avg('Positive', 'Negative')
    save_csv(byState, 'state_data.csv')
    # +-----------------+-------------------+
    # |author_flair_text|    avg(prediction)|
    # +-----------------+-------------------+
    # |             Utah|0.37254901960784315|
    # |           Hawaii|0.42857142857142855|
    # |        Minnesota| 0.3856893542757417|
    # |             Ohio|  0.410427807486631|
    # |           Oregon| 0.4177831912302071|
    # |         Arkansas| 0.3548387096774194|
    # |            Texas|         0.43359375|
    # |     North Dakota| 0.4126984126984127|
    # |     Pennsylvania|0.42705882352941177|
    # |      Connecticut|0.40119760479041916|
    # |          Vermont|0.38028169014084506|
    # |         Nebraska| 0.4528301886792453|
    # |           Nevada| 0.4110429447852761|
    # |       Washington|  0.407436096049574|
    # |         Illinois|0.44341801385681295|
    # |         Oklahoma|               0.43|
    # |         Delaware|                0.4|
    # |           Alaska| 0.4423076923076923|
    # |       New Mexico| 0.5076923076923077|
    # |    West Virginia|                0.5|
    # +-----------------+-------------------+

    # 4
    byCommentScore = df10.groupBy('cscore').avg('Positive', 'Negative')
    byStoryScore = df10.groupBy('sscore').avg('Positive', 'Negative')
    save_csv(byCommentScore, 'comment_score.csv')
    save_csv(byStoryScore, 'story_score.csv')

    # 5 Locked vs Unlocked
    byDateLocked = daily_average(df10.filter('locked == true'))
    byDateUnlocked = daily_average(df10.filter('locked == false'))
    save_csv(byDateLocked, 'locked_data.csv')
    save_csv(byDateUnlocked, 'unlocked_data.csv')

    # 5 over_18
    byDate18 = daily_average(df10.filter('over_18 == true'))
    save_csv(byDate18, 'over18_data.csv')

    # final 4: ten titles with the highest average positive / negative share
    dff4 = df10.groupBy('title').agg(
        avg('Positive').alias('avgPos'), avg('Negative').alias('avgNeg'))
    dff4.orderBy('avgPos', ascending=0).limit(10).show(truncate=False)
    dff4.orderBy('avgNeg', ascending=0).limit(10).show(truncate=False)
def main(context):
    """Train the positive/negative classifiers and analyse all comments.

    Steps, in order: read the three parquet inputs, join labels to
    comments, sanitize the text, fit and save a binary CountVectorizer,
    train and save two cross-validated logistic regressions, score every
    comment, write `result.parquet`, then export per-title, per-day,
    per-state, per-score and per-gilded averages as CSV.

    Args:
        context: Spark SQL context; used for `read`, `sql`,
            `registerDataFrameAsTable` and `registerFunction`.
    """
    #---------------------------------------------------------------------------
    # TASK 1
    # The parquet inputs read below were produced once with:
    # df = context.read.csv('labeled_data.csv')
    # df.write.parquet("labeled_data.parquet")
    # comments = context.read.json("comments-minimal.json.bz2")
    # comments.write.parquet("comments.parquet")
    # submissions = context.read.json("submissions.json.bz2")
    # submissions.write.parquet("submissions.parquet")
    labeled_data = context.read.parquet('labeled_data.parquet')
    labeled_data = (
        labeled_data.withColumnRenamed("_c0", "Input_id")
        .withColumnRenamed("_c1", "labeldem")
        .withColumnRenamed("_c2", "labelgop")
        .withColumnRenamed("_c3", "labeldjt")
    )
    # labeled_data.show()
    comments = context.read.parquet('comments.parquet')
    # comments.show()
    submissions = context.read.parquet('submissions.parquet')
    # submissions.show()

    #---------------------------------------------------------------------------
    # TASK 2
    # Attach the comment body to each labeled row.
    labeled_comments = labeled_data.join(
        comments, comments.id == labeled_data.Input_id)
    labeled_comments = labeled_comments.select('Input_id', 'labeldjt', 'body')
    # labeled_comments.show()

    #---------------------------------------------------------------------------
    # TASK 4
    sanitize_udf = udf(sanitize, ArrayType(StringType()))

    #---------------------------------------------------------------------------
    # TASK 5
    sanitized_labeled_comments = labeled_comments.select(
        'Input_id', 'labeldjt', sanitize_udf('body').alias('raw'))

    #---------------------------------------------------------------------------
    # TASK 6A
    # Binary bag of words; tokens must occur in at least 10 documents.
    cv = CountVectorizer(binary=True, minDF=10.0,
                         inputCol="raw", outputCol="features")
    model = cv.fit(sanitized_labeled_comments)
    sanitized_labeled_comments = model.transform(sanitized_labeled_comments)
    sanitized_labeled_comments.show(truncate=False)
    countVectorizerPath = "count_vectorizer_model"
    model.save(countVectorizerPath)

    #---------------------------------------------------------------------------
    # TASK 6B
    # Labels: {1, 0, -1, -99}
    # Positive target: 1 stays 1, every other label becomes 0.
    pos = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    pos = pos.withColumnRenamed("labeldjt", "label")
    pos = pos.replace(-1, 0)
    pos = pos.replace(-99, 0)
    # pos.show()
    # Negative target: -1 becomes 1, every other label becomes 0. The 1 -> 0
    # replacement must run before -1 -> 1 so the new 1s are not cleared.
    neg = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    neg = neg.withColumnRenamed("labeldjt", "label")
    neg = neg.replace(1, 0)
    neg = neg.replace(-99, 0)
    neg = neg.replace(-1, 1)
    # neg.show()

    #---------------------------------------------------------------------------
    # TASK 7
    # Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to
    # deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do
    # not know what they are a priori. We do a grid search to find the best
    # parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid, numFolds=5)
    negCrossval = CrossValidator(estimator=neglr, evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid, numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
    # Once we train the models, we don't want to do it again. We can save the
    # models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")
    # Positive Model: posModel
    # Negative Model: negModel

    #---------------------------------------------------------------------------
    # TASK 8
    # Make the final deliverable for unseen data - labeled_data is no longer
    # needed.
    strip_t3_udf = udf(strip_t3, StringType())
    sarcastic_or_quote_udf = udf(sarcastic_or_quote, BooleanType())
    # Get unseen data. Rows are kept where sarcastic_or_quote returns True.
    sanitized_final_deliverable = comments.select(
        'created_utc',
        strip_t3_udf(comments.link_id).alias('link_id'),
        'author_flair_text',
        'id',
        'body',
        'gilded',
        sanitize_udf('body').alias('raw'),
        comments.score.alias('c_score'),
    ).filter(sarcastic_or_quote_udf(comments['body']))
    #F.when(comments["body"].rlike('^>|\/s'), False).otherwise(True))
    # sanitized_final_deliverable.show()

    #---------------------------------------------------------------------------
    # TASK 9
    # Load models that we saved on previous runs of this script
    model = CountVectorizerModel.load("count_vectorizer_model")
    posModel = CrossValidatorModel.load("project2/pos.model")
    negModel = CrossValidatorModel.load("project2/neg.model")
    # Run the CountVectorizerModel on the TASK 8 relation
    sanitized_final_deliverable = model.transform(sanitized_final_deliverable)
    # Run classifier on unseen data to get positive labels
    posResult = posModel.transform(sanitized_final_deliverable)
    # Rename the 3 new columns to prevent name conflicts
    posResult = (
        posResult.withColumnRenamed("probability", "probability_pos")
        .withColumnRenamed("rawPrediction", "rawPrediction_pos")
        .withColumnRenamed("prediction", "prediction_pos")
    )
    # Run the classifier on previous positive result to get negative labels too
    result = negModel.transform(posResult)
    # Rename the 3 new columns to make it easier to see which is which
    result = (
        result.withColumnRenamed("probability", "probability_neg")
        .withColumnRenamed("rawPrediction", "rawPrediction_neg")
        .withColumnRenamed("prediction", "prediction_neg")
    )
    # UDF functions for predicting label based on thresholds
    predict_pos_udf = udf(predict_pos, IntegerType())
    predict_neg_udf = udf(predict_neg, IntegerType())
    # Make predictions based on probability and threshold:
    result = result.select(
        'created_utc', 'author_flair_text', 'link_id', 'id', 'c_score',
        'gilded',
        predict_pos_udf(result.probability_pos).alias('pos'),
        predict_neg_udf(result.probability_neg).alias('neg'))
    result.write.parquet("result.parquet")
    # result.show()

    #---------------------------------------------------------------------------
    # TASK 10
    # Perform analysis on the predictions
    result = context.read.parquet("result.parquet")
    submissions = submissions.select('id', 'title',
                                     submissions.score.alias('s_score'))
    result = result.join(submissions, result.link_id == submissions.id)
    # .explain()
    result.show()
    context.registerDataFrameAsTable(result, "result")

    # 1. Percentage of Comments that Were Positive/Negative Across ALL Submissions
    task_10_1 = context.sql(
        "SELECT title, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY title"
    )
    task_10_1.show()
    task_10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_1.csv")

    # 2. Percentage of Comments that Were Positive/Negative Across ALL Days
    # 'yyyy-MM-dd' is used instead of 'Y-M-d': 'Y' is the week-based year
    # (wrong around New Year) and unpadded month/day break the string
    # ordering that ORDER BY day relies on.
    task_10_2 = context.sql(
        "SELECT FROM_UNIXTIME(created_utc, 'yyyy-MM-dd') AS day, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY day ORDER BY day asc"
    )
    task_10_2.show()
    task_10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_2.csv")

    # 3. Percentage of Comments that Were Positive/Negative Across ALL States
    context.registerFunction("check_state_udf", check_state, BooleanType())
    task_10_3 = context.sql(
        "SELECT author_flair_text AS state, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result WHERE check_state_udf(author_flair_text) = True GROUP BY state"
    )
    task_10_3.show()
    task_10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_3.csv")

    # 4A. Percentage of Comments that Were Positive/Negative Across ALL Comments
    task_10_4A = context.sql(
        "SELECT c_score AS comment_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY comment_score"
    )
    task_10_4A.show()
    task_10_4A.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4A.csv")

    # 4B. Percentage of Comments that Were Positive/Negative Across ALL Story Scores
    task_10_4B = context.sql(
        "SELECT s_score AS submission_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY submission_score"
    )
    task_10_4B.show()
    task_10_4B.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4B.csv")

    #---------------------------------------------------------------------------
    # Extra Credit (Task 10)
    # 1. Percentage of Comments that Were Positive/Negative For Gilded and Non-Gilded Comments
    task_10_extra_credit = context.sql(
        "SELECT gilded, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY gilded"
    )
    task_10_extra_credit.show()
    task_10_extra_credit.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("task_10_extra_credit.csv")
# COMMAND ----------

def find_nearest_books(book_id, num):
    """Print the titles of the `num` approximate nearest neighbours of a book.

    Looks up the `features` value of `book_id` in `data_pca`, queries the
    module-level `model` with it, and prints one title per returned row.
    """
    matching_rows = (
        data_pca.filter(data_pca.book_id == book_id).select("features").collect()
    )
    # First column of the first matching row is the query vector.
    query_vector = matching_rows[0][0]
    neighbours = model.approxNearestNeighbors(data_pca, query_vector, num)
    for neighbour in neighbours.select("book_id").collect():
        print(get_book_title(neighbour[0]))


find_nearest_books(100001, 10)

# COMMAND ----------

# DBTITLE 1,Latent Dirichlet allocation
from pyspark.ml.clustering import LDA

# Turn the filtered tokens into term-count vectors.
vectorizer = CountVectorizer(inputCol="filtered", outputCol="features")
cv = vectorizer.fit(filteredData)
featurizedData = cv.transform(filteredData)

# Fit a 20-topic LDA model with 10 iterations.
# NOTE: this rebinds the module-level `model` that find_nearest_books reads,
# so calling find_nearest_books after this cell would query the LDA model.
lda = LDA(k=20, maxIter=10)
model = lda.fit(featurizedData)

# Show the three top-weighted terms per topic.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show()

transformed = model.transform(featurizedData)
transformed.show()

# COMMAND ----------
spark.createDataFrame(lista_sentimientos, StringType()).show() #print(dfSpark.show()) # COMMAND ---------- len_udf = udf(lambda s: len(s), IntegerType()) dfSpark = dfSpark.withColumn("token_count", len_udf(col('refined_tokens'))) dfSpark.orderBy(rand()).show(10) # COMMAND ---------- from pyspark.ml.feature import CountVectorizer # COMMAND ---------- count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features') dfSpark_V = count_vec.fit(dfSpark).transform(dfSpark) dfSpark_V.select(['refined_tokens', 'token_count', 'features', 'Label']).show(10) # COMMAND ---------- model_df = dfSpark_V.select(['features', 'token_count', 'Label']) # COMMAND ---------- from pyspark.ml.feature import VectorAssembler # COMMAND ---------- df_assembler = VectorAssembler(inputCols=['features', 'token_count'],
def main(context):
    """Build, train and apply the sentiment models, then export Task 10 CSVs.

    Intermediate results are cached as parquet in the working directory
    and reused when present. The function converts the raw inputs to
    parquet, joins labels to comments, sanitizes the text, fits a binary
    CountVectorizer, trains and saves two cross-validated logistic
    regressions, scores a sample of the unlabeled comments, and writes
    the aggregate CSV reports.

    Args:
        context: Spark SQL context; used for `read`, `sql` and
            `registerFunction`.
    """
    # TASK 1
    # Load the data into PySpark.
    # For the comments:
    if not os.path.exists("./comments.parquet"):
        comments = context.read.json("comments-minimal.json.bz2")
        comments.write.parquet("comments.parquet")
    # For the submissions:
    if not os.path.exists("./submissions.parquet"):
        submissions = context.read.json("submissions.json.bz2")
        submissions.write.parquet("submissions.parquet")
    #submissions.printSchema()
    # For labelled data:
    if not os.path.exists("./labels.parquet"):
        labels = context.read.format('csv').options(
            header='true', inferSchema='true').load("labeled_data.csv")
        labels.write.parquet("labels.parquet")

    # TASK 2
    # Join the labels and comments.
    commentsParquet = context.read.parquet("comments.parquet")
    commentsParquet.createOrReplaceTempView("comments")
    labelsParquet = context.read.parquet("labels.parquet")
    labelsParquet.createOrReplaceTempView("labels")
    # Now, compute the join:
    if not os.path.exists("./joinedComments.parquet"):
        joinedComments = context.sql(
            "SELECT labels.Input_id, labels.labeldem, labels.labelgop, labels.labeldjt, body FROM comments JOIN labels on id=Input_id"
        )
        joinedComments.write.parquet("joinedComments.parquet")
    joinedComments = context.read.parquet("joinedComments.parquet")
    joinedComments.createOrReplaceTempView("joinedComments")
    #joinedComments.printSchema()

    # TASK 3
    # NOT NEEDED

    # TASK 4
    # Register the user defined function
    context.registerFunction("sanitize", clean_wrapper,
                             ArrayType(StringType()))

    # TASK 5
    # The guard must test the same path that is written below. It used to
    # test "./santized.parquet", which never exists, so the sanitized data was
    # recomputed and re-written on every run.
    if not os.path.exists("./sanitized.parquet"):
        sanitizedText = context.sql(
            "SELECT Input_id, labeldem, labelgop, labeldjt, sanitize(body) as body FROM joinedComments"
        )
        sanitizedText.write.parquet("sanitized.parquet")

    # TASK 6A
    sanitizedText = context.read.parquet("sanitized.parquet")
    sanitizedText.createOrReplaceTempView("sanitizedText")
    # Binary bag of words; tokens must occur in at least 10 documents.
    cv = CountVectorizer(inputCol="body", outputCol="features", minDF=10.0,
                         binary=True)
    fitted = cv.fit(sanitizedText)
    vector = fitted.transform(sanitizedText)

    # TASK 6B
    # One-vs-rest targets: label is 1 for labeldjt=1 (pos) or -1 (neg).
    vector.createOrReplaceTempView("vector")
    pos = context.sql("SELECT *, if(labeldjt=1, 1, 0) AS label FROM vector")
    neg = context.sql("SELECT *, if(labeldjt=-1, 1, 0) AS label FROM vector")

    # TASK 7
    # Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to
    # deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do
    # not know what they are a priori. We do a grid search to find the best
    # parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid, numFolds=5)
    negCrossval = CrossValidator(estimator=neglr, evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid, numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
    # Once we train the models, we don't want to do it again. We can save the
    # models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # TASK 8 and TASK 9
    # Create the submissions and comments tables from the parquets:
    if not os.path.exists("sanitizedJoinedData.parquet"):
        submissions = context.read.parquet("submissions.parquet")
        submissions.createOrReplaceTempView("submissions")
        comments = context.read.parquet("comments.parquet")
        comments.createOrReplaceTempView("comments")
        # NOTE(review): the "comments" view was registered before this
        # sampling, so the SQL join below still reads the unsampled data.
        # Kept as in the original; confirm the intent.
        comments = comments.sample(False, 0.2, None)
        # Join comments to their submission, excluding bodies that contain
        # "/s" or start with ">".
        joinedData = context.sql(
            "SELECT comments.link_id AS id, comments.body, comments.created_utc, submissions.title, comments.author_flair_text, submissions.score AS submission_score, comments.score as comments_score FROM comments JOIN submissions ON REPLACE(comments.link_id, 't3_', '')=submissions.id AND comments.body NOT LIKE '%/s%' AND comments.body NOT LIKE '>%'"
        )
        #joinedData.show(joinedData.count(), False)
        #print(str(joinedData.count()))
        # Repeating earlier tasks: Tasks 4 and 5
        joinedData.createOrReplaceTempView("joinedData")
        # Re-register temporary function since we are forced to:
        context.registerFunction("sanitize", clean_wrapper,
                                 ArrayType(StringType()))
        print("writing sanitized parquet now")
        sanitizedJoinedData = context.sql(
            "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, sanitize(body) AS body FROM joinedData"
        )
        sanitizedJoinedData.write.parquet("sanitizedJoinedData.parquet")
    sanitizedJoinedData = context.read.parquet("sanitizedJoinedData.parquet")
    sanitizedJoinedData = sanitizedJoinedData.sample(False, 0.2, None)
    # Reuse the vocabulary fitted on the labeled data so the feature indices
    # match what the classifiers were trained on.
    newVector = fitted.transform(sanitizedJoinedData)
    seenPosModel = CrossValidatorModel.load("project2/pos.model")
    seenNegModel = CrossValidatorModel.load("project2/neg.model")
    posResult = seenPosModel.transform(newVector)
    # Keep only the probability column so the second model can add its own
    # prediction columns without a name clash.
    posResult = posResult.selectExpr(
        "id", "created_utc", "title", "author_flair_text", "submission_score",
        "comments_score", "body", "features",
        "probability as positive_probability")
    cumResult = seenNegModel.transform(posResult)
    cumResult = cumResult.selectExpr(
        "id", "created_utc", "title", "author_flair_text", "submission_score",
        "comments_score", "body", "features", "positive_probability",
        "probability as negative_probability")
    cumResult.createOrReplaceTempView("cumResult")
    # Convert each probability column to an integer via the two UDFs.
    context.registerFunction("positiveFunc", positiveUDF, IntegerType())
    context.registerFunction("negativeFunc", negativeUDF, IntegerType())
    cumResult = context.sql(
        "SELECT id, created_utc, title, author_flair_text, submission_score, comments_score, body, features, positiveFunc(positive_probability) AS positive_probability,negativeFunc(negative_probability) AS negative_probability FROM cumResult"
    )
    cumResult.write.parquet("cumResult.parquet")

    # TASK 10
    cumResult = context.read.parquet("cumResult.parquet")
    cumResult.createOrReplaceTempView("cumResult")

    # Actual 10.2: averages per calendar date
    task10_6 = context.sql(
        "SELECT DATE(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created ORDER BY date_created"
    )
    task10_6.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_6.csv")

    # Top 10 posts:
    if not os.path.exists("./task10_top_pos.csv"):
        task10_top_pos = cumResult.groupBy('title')\
            .agg(
                (F.sum('positive_probability') / F.count(F.lit(1))).alias('pct_pos'),
                F.count(F.lit(1)).alias('count')
            )\
            .orderBy(F.desc('pct_pos'), F.desc('count')).limit(10)\
            .select('title', 'pct_pos')
        task10_top_pos.repartition(1).write.format(
            "com.databricks.spark.csv").option(
                "header", "true").save("task10_top_pos.csv")
    if not os.path.exists("./task10_top_neg.csv"):
        task10_top_neg = cumResult.groupBy('title')\
            .agg(
                (F.sum('negative_probability') / F.count(F.lit(1))).alias('pct_neg'),
                F.count(F.lit(1)).alias('count')
            )\
            .orderBy(F.desc('pct_neg'), F.desc('count')).limit(10)\
            .select('title', 'pct_neg')
        task10_top_neg.repartition(1).write.format(
            "com.databricks.spark.csv").option(
                "header", "true").save("task10_top_neg.csv")

    # 10.1
    # Get the number of records
    totalRows = cumResult.count()
    # Calculate percentages
    task10_1 = context.sql(
        "SELECT SUM(positive_probability)/ {0} AS pos, SUM(negative_probability)/{1} AS neg FROM cumResult"
        .format(totalRows, totalRows))

    # 10.2: averages per day of week
    task10_2 = context.sql(
        "SELECT DAYOFWEEK(FROM_UNIXTIME(created_utc)) AS date_created, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult GROUP BY date_created"
    )

    # 10.3
    context.registerFunction("checkStateWrapper", checkState, BooleanType())
    task10_3 = context.sql(
        "SELECT author_flair_text AS state, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/COUNT(negative_probability) AS neg FROM cumResult WHERE(checkStateWrapper(author_flair_text)) GROUP BY author_flair_text"
    )

    # 10.4
    task10_4 = context.sql(
        "SELECT comments_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY comments_score"
    )
    task10_5 = context.sql(
        "SELECT submission_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY submission_score"
    )

    # cumResult.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("cumResults.csv")
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_1.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_2.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_3.csv")
    task10_4.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_4.csv")
    task10_5.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_5.csv")
def train_model():
    """Train a Naive Bayes sentiment pipeline on text_emotion.csv and save it.

    Reads the CSV through the module-level `sqlContext`, keeps rows whose
    `sentiment` is one of the known classes, splits 80/20 with a fixed
    seed, fits tokenizer -> CountVectorizer -> StringIndexer ->
    NaiveBayes -> IndexToString on the training split, prints the ten
    most confident test predictions per class and the F1 score, and
    saves the fitted pipeline to "sentiment.model".
    """
    # Sentiment classes to keep; any other value in the column is dropped.
    sentiments = [
        'empty', 'sadness', 'enthusiasm', 'worry', 'surprise', 'love', 'hate',
        'anger', 'neutral', 'relief', 'boredom', 'fun', 'happiness',
    ]

    data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load('text_emotion.csv')

    # Drop unused columns
    drop_list = ['tweet_id']
    data = (
        data.select([column for column in data.columns
                     if column not in drop_list])
        .where(data['sentiment'].isin(sentiments))
        .na.drop(thresh=3)
    )
    data.show(5)
    data.groupBy("sentiment").count().orderBy(col("count").desc()).show()

    # set seed for reproducibility
    (trainingData, testData) = data.randomSplit([0.8, 0.2], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    # regular expression tokenizer
    regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words",
                                    pattern="\\W")
    # bag of words count
    countVectors = CountVectorizer(inputCol="words", outputCol="features",
                                   vocabSize=10000, minDF=5)
    # Convert string labels to indexes. The indexer is fitted up front so the
    # index -> label mapping it learned can be handed to IndexToString.
    label_stringIdx = StringIndexer(inputCol="sentiment",
                                    outputCol="label").fit(trainingData)
    nb = NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0,
                    modelType="multinomial")
    # Convert each prediction index back to its sentiment string. A
    # hand-written label list would not follow the indexer's own ordering,
    # so the fitted indexer's labels are used.
    indexToLabels = IndexToString(inputCol="prediction",
                                  outputCol="predictedSentiment",
                                  labels=label_stringIdx.labels)

    # Build and fit the pipeline; the pre-fitted indexer is used as-is.
    pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx,
                                nb, indexToLabels])
    pipelineFit = pipeline.fit(trainingData)
    predictions = pipelineFit.transform(testData)

    # For every class index, show the 10 test rows ordered by probability
    # descending.
    shown_columns = ["content", "sentiment", "predictedSentiment",
                     "probability", "label", "prediction"]
    for class_index in range(len(label_stringIdx.labels)):
        (predictions.filter(predictions['prediction'] == class_index)
            .select(*shown_columns)
            .orderBy("probability", ascending=False)
            .show(n=10, truncate=30))

    # Retrieve the F1 score
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="label")
    print("F1: %g" % (evaluator.evaluate(predictions)))
    pipelineFit.save("sentiment.model")
# Work on the title and category columns only.
title_category = news_data.select('TITLE', 'CATEGORY')

# Number of distinct categories.
title_category.select('Category').distinct().count()

# Top 20 news categories (show() prints 20 rows by default):
title_category.groupBy('Category').count().orderBy(col('Count').desc()).show(truncate=False)

# Top 20 news titles:
title_category.groupBy('TITLE').count().orderBy(col('count').desc()).show(truncate=False)

####
# regexp_replace: remove every run of digits from the title.
# The pattern is a raw string so that \d reaches the regex engine without
# being an invalid Python string escape.
title_category = title_category.withColumn(
    'only_str', regexp_replace(col('TITLE'), r'\d+', ''))
title_category.select('TITLE', 'only_str').show(truncate=False)

# Split the digit-free title on non-word characters.
regex_tokenizer = RegexTokenizer(inputCol='only_str', outputCol='words',
                                 pattern='\\W')
raw_words = regex_tokenizer.transform(title_category)
raw_words.show()

# Remove stop words from the token lists.
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
word_df = remover.transform(raw_words)
word_df.select('words', 'filtered').show(truncate=False)

# Encode the category string as a numeric index.
indexer = StringIndexer(inputCol='CATEGORY', outputCol='categoryIndex')
feature_data = indexer.fit(word_df).transform(word_df)
feature_data.show()

# Bag-of-words vectorizer over the filtered tokens (not fitted here).
cv = CountVectorizer(inputCol='filtered', outputCol='features')
def compute(sc, topLeft, bottomRight, step, datasetPath, k, gfs):
    """Run LDA topic extraction per grid square and save the topic labels.

    The CSV at `datasetPath` is read, rows are assigned to squares by
    `is_inside`, and the text is cleaned and grouped per square. For each
    square English, Italian and German stop words are removed, a
    CountVectorizer and a `k`-topic LDA model are fitted, and one label
    term per topic is collected. Result lines plus the elapsed time are
    saved as a text file; when `gfs` is truthy the output is also copied
    with `hdfs dfs -copyToLocal` and `gsutil cp`.

    Args:
        sc: SparkContext.
        topLeft: top-left corner; its `.x` and `.y` are written to the output.
        bottomRight: bottom-right corner; `.x` and `.y` likewise.
        step: grid step, normalised through `check_step`.
        datasetPath: CSV path; the text between the first "_" and the
            first "." of the path is used as the size tag of the output
            folder.
        k: number of LDA topics.
        gfs: truthy to write under /tmp and copy results to the bucket.

    Returns:
        A list with one entry per processed square; each entry is the
        list of topic label terms for that square.
    """
    sqlContext = SQLContext(sc)
    data = sc.textFile(datasetPath)
    data = data.mapPartitions(lambda x: csv.reader(x))
    # Drop the header row.
    header = data.first()
    data = data.filter(lambda x: x != header)
    result_to_write = []
    res_computation = []
    step = check_step(topLeft, bottomRight, step)
    squares = get_squares(topLeft, bottomRight, step)

    # start computing elapsed time here
    start_time = time.time()
    data = data.map(lambda x: is_inside(x, topLeft, bottomRight, step, squares)). \
        filter(lambda x: x is not None)
    data = data.map(remove_punctuation). \
        map(split_string_into_array). \
        filter(remove_empty_array). \
        map(create_row). \
        groupByKey(). \
        map(lambda x: (x[0], list(x[1])))

    # create the dataframes
    allDf = []
    for df in data.collect():
        if df:
            allDf.append([df[0], sqlContext.createDataFrame(df[1])])

    # One remover per language, built once. The language list must be passed
    # through `stopWords`: loadDefaultStopWords only returns the list, so
    # calling it and discarding the result leaves the English default in place.
    removers = []
    input_col = "words"
    for language, output_col in (("english", "filtered_eng"),
                                 ("italian", "filtered_ita"),
                                 ("german", "filtered_ger")):
        removers.append(StopWordsRemover(
            inputCol=input_col, outputCol=output_col,
            stopWords=StopWordsRemover.loadDefaultStopWords(language)))
        input_col = output_col

    for docDFs in allDf:
        docDF = docDFs[1]
        # Apply the removers in sequence, dropping each consumed input column.
        newDocDF_ger = docDF
        for remover in removers:
            newDocDF_ger = remover.transform(newDocDF_ger).drop(
                remover.getInputCol())

        model = CountVectorizer(inputCol="filtered_ger", outputCol="vectors"). \
            fit(newDocDF_ger)
        result = model.transform(newDocDF_ger)
        corpus = result.select("idd", "vectors").rdd.map(create_corpus).cache()

        # cluster the documents into the k topics using LDA
        ldaModel = LDA.train(corpus, k=k, maxIterations=100,
                             optimizer='online')
        vocabArray = model.vocabulary
        wordNumbers = 10  # number of words per topic
        topicIndices = sc.parallelize(
            ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
        toBePrinted = min(len(vocabArray), wordNumbers)
        topics_final = topicIndices.map(
            lambda x: topic_render(x, toBePrinted, vocabArray)).collect()

        # compute labels: the first term of each topic not already used as
        # a label by an earlier topic
        topics_label = []
        for topic in topics_final:
            for topic_term in topic:
                if topic_term not in topics_label:
                    topics_label.append(topic_term)
                    break

        # print topics
        # NOTE(review): the line reports the overall bounding box, not the
        # individual square (docDFs[0]) - confirm this is intended.
        s = "; "
        res = "{}, {}, {}, {}, {}".format(topLeft.x, topLeft.y, bottomRight.x,
                                          bottomRight.y, s.join(topics_label))
        result_to_write.append(res)
        res_computation.append(topics_label)

    end_time = time.time()
    elapsed_time = end_time - start_time
    result_to_write.append(elapsed_time)
    to_write = sc.parallelize(result_to_write)

    # get dataset size from file name
    size = datasetPath.split('.')[0].split('_')[1]
    if gfs:
        output_folder = "/tmp/Topic_Zoomer_" + str(
            time.ctime(start_time)).replace(' ', '_').replace(':', '-') + '_' + size
    else:
        output_folder = "Topic_Zoomer_" + str(time.ctime(start_time)).replace(
            ' ', '_').replace(':', '-') + '_' + size
    to_write.saveAsTextFile(output_folder)

    if gfs:
        copyHdfsCmd = 'hdfs dfs -copyToLocal {} {}'.format(
            output_folder, output_folder)
        copyBucketCmd = 'gsutil cp -r {} {}'.format(output_folder,
                                                    gfs_output_path_hdfs)
        copyRecBucketCmd = 'gsutil cp -r {} {}'.format(recFileFolder,
                                                       gfs_output_path_hdfs)
        copyHdfsRes = subprocess.call(shlex.split(copyHdfsCmd))
        copyBucketRes = subprocess.call(shlex.split(copyBucketCmd))
        copyRecBucketRes = subprocess.call(shlex.split(copyRecBucketCmd))
        # some exit code checks
        if copyBucketRes or copyHdfsRes or copyRecBucketRes:
            print('hdfsRes: {}'.format(copyHdfsRes))
            print('bucketResComp: {}'.format(copyBucketRes))
            print('bucketResRec: {}'.format(copyRecBucketRes))
            print('Something went wrong while copying results')
    return res_computation
def __init__(self):
    """Train the hotel-review sentiment classifier and keep it on ``self.model``.

    Reads ``hotel-reviews.csv``, labels each review (1 when ``Reviewer_Score``
    is at least 7.0, otherwise 0), builds TF-IDF features from the combined
    negative + positive review text, fits a logistic regression on 60% of the
    rows and shows the share of correct predictions on a 30% validation split.
    The stop-word list is downloaded over HTTP every time this runs.
    """
    # Load the reviews CSV (first row is the header) as a Spark DataFrame.
    reviews = sqlContext.read.format("csv") \
        .option("header", "true") \
        .load("hotel-reviews.csv")

    # Binarise the score into a sentiment label: 1 for >= 7.0, 0 otherwise.
    sentiment_flag = fn.when(reviews.Reviewer_Score >= 7.0, 1).otherwise(0)
    reviews = reviews.withColumn('Reviewer_Score', sentiment_flag)
    reviews = reviews.withColumnRenamed('Reviewer_Score', 'Sentiment')

    # Join the negative and positive parts into one review text, then keep
    # only the columns the sentiment analysis needs.
    full_text = fn.concat(fn.col('Negative_Review'), fn.lit(' '), fn.col('Positive_Review'))
    df_stripped = reviews.withColumn('Review_Text', full_text) \
        .select('Negative_Review', 'Positive_Review', 'Review_Text', 'Sentiment')

    # Stop words to drop from the reviews, one whitespace-separated token each.
    stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()

    # gaps=False: the pattern describes the tokens themselves (runs of
    # letters), not the separators between them.
    tokenizer = RegexTokenizer(gaps=False,
                               pattern="\\p{L}+",
                               inputCol="Review_Text",
                               outputCol="words")

    # Case-insensitive stop-word filter.
    sw_filter = StopWordsRemover(stopWords=stop_words,
                                 caseSensitive=False,
                                 inputCol="words",
                                 outputCol="filtered")

    # Term counts over the filtered tokens.
    cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2 ** 17,
                         inputCol="filtered",
                         outputCol="tf")

    # Tokenizer -> stop-word filter -> term counts.
    # NOTE: this and the IDF stage below are fitted on the full data set,
    # i.e. before the train/validation/test split.
    cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(df_stripped)

    # TF-IDF weighting on top of the fitted counting pipeline.
    idf = IDF(inputCol='tf', outputCol='tfidf')
    idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(df_stripped)

    # 60% training, 30% validation, 10% testing (the test part is unused here).
    training_df, validation_df, testing_df = df_stripped.randomSplit([0.6, 0.3, 0.1], seed=0)

    # Logistic regression on the TF-IDF features, with regParam set to 0.
    lr = LogisticRegression(labelCol='Sentiment',
                            featuresCol='tfidf',
                            regParam=0.0,
                            maxIter=100,
                            elasticNetParam=0.)

    # Feature pipeline + classifier, trained on the training split only.
    self.model = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)

    # Show the fraction of validation rows whose prediction equals the label.
    scored = self.model.transform(validation_df)
    correctness = scored.select(fn.expr('float(prediction = Sentiment)').alias('correct'))
    correctness.select(fn.avg('correct')).show()


spark = (
    SparkSession
    .builder
    .appName("user_input_analysis")
    .getOrCreate()
)
# alltags=tags_users.map(lambda x:Counter(x.tags)).reduce(lambda a,b:a+b) # print(alltags.most_common(10)) #.filter(lambda x:len(x.tags)>100) # filtering to get smaller dataset # print(tags_users.count()) # print(tags_users.first()) ## Filtered for testing tags_users_df = sqlContext.createDataFrame(tags_users) print(tags_users_df.take(2)) # # # print('Indexing strings') cVec = CountVectorizer(inputCol='tags', outputCol="tag_features", minDF=10.) model = cVec.fit(tags_users_df) td = model.transform(tags_users_df) with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl', mode='wb') as ff: pkl.dump(model.vocabulary, ff) normalizer = Normalizer(p=1., inputCol='tag_features', outputCol='tags_normalized') tdNorm = normalizer.transform(td) print(tdNorm.take(5)) tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet')
def get_trending_news(rdd):
    """Store a batch of news items and the stories that match its LDA topics.

    For a non-empty ``rdd`` the function:

    1. appends the raw items to the Cassandra table ``travel_news_data``;
    2. cleans ``title`` + ``summary`` (punctuation and non-ASCII removed),
       tokenizes, drops stop words and fits a 5-topic LDA model;
    3. for every topic, selects the items whose cleaned text matches any of
       the topic's terms and appends them, with the topic words and a
       timestamp, to ``travel_news_data_results``.

    An empty ``rdd`` is ignored.

    Args:
        rdd: RDD of news records; the code reads the fields ``id``,
            ``news_provider``, ``published``, ``summary`` and ``title``.
    """
    if rdd.isEmpty():
        return

    spark = getSparkSessionInstance(rdd.context.getConf())
    df = spark.createDataFrame(rdd)

    # Append the title and summary together
    df_news_concat = df.withColumn("news_content",
                                   fn.concat_ws(" ", df.title, df.summary))
    # Raw string: "\p" is not a valid Python escape sequence, so the non-raw
    # form triggers an invalid-escape warning on current interpreters.
    df_punc_removed = df_news_concat.withColumn(
        "news_content_removed",
        fn.regexp_replace(df_news_concat.news_content, r"\p{Punct}", ""))
    udf_remove_unicode = fn.udf(
        lambda x: x.encode("ascii", "ignore").decode("ascii"))
    df_news_content_ascii = df_punc_removed.withColumn(
        "news_content_ascii",
        udf_remove_unicode(df_punc_removed.news_content_removed))

    # insert raw data to the cassandra table
    df_news_content_ascii.select("id", "news_provider", "published", "summary", "title") \
        .write \
        .format("org.apache.spark.sql.cassandra") \
        .mode("append") \
        .options(table="travel_news_data", keyspace="news_stream_analysis") \
        .save()

    tokenizer = Tokenizer(inputCol="news_content_ascii",
                          outputCol="content_words")
    df_tokenized_content = tokenizer.transform(df_news_content_ascii).drop(
        "news_content")

    remover = StopWordsRemover(inputCol="content_words",
                               outputCol="filtered_words")
    # Default English stop words plus domain-specific filler words.
    stop_words = remover.loadDefaultStopWords("english")
    stop_words.extend([
        '', "travel", "trip", "submitted", "abc", "reditt", "by", "time",
        "timing", "comments", "comment", "thank", "link", "im", "thanks",
        "would", "like", "get", "good", "go", "may", "also", "going", "dont",
        "want", "see", "take", "looking", ""
    ])
    remover.setStopWords(stop_words)
    df_stop_words_removed = remover.transform(df_tokenized_content).drop(
        "content_words")

    cv = CountVectorizer(inputCol="filtered_words", outputCol="rawFeatures")
    cv_model = cv.fit(df_stop_words_removed)
    df_tf_data = cv_model.transform(df_stop_words_removed)
    df_features = df_tf_data.select(
        df_tf_data.rawFeatures.alias("features"))

    def convert_term_indices_to_term(term_indices, vocab):
        """Return the ``str()`` of the vocabulary terms at ``term_indices``."""
        terms = []
        for t in term_indices:
            terms.append(vocab[t])
        return str(terms)

    # LDA
    lda = LDA(k=5, maxIter=50, learningOffset=8192.0, learningDecay=0.50)
    model = lda.fit(df_features)
    df_topics = model.describeTopics()

    fn_term_indices_to_term = fn.udf(convert_term_indices_to_term)
    # The vocabulary is passed to the UDF as an array literal column.
    vocab_lit = fn.array(*[fn.lit(k) for k in cv_model.vocabulary])
    df_lda_result = df_topics.withColumn(
        "terms", fn_term_indices_to_term("termIndices", vocab_lit))
    df_lda_result.select("topic", "termIndices",
                         "terms").show(truncate=False)
    df_lda_result.cache()

    lda_terms = df_lda_result.select("terms").collect()
    lda_terms_list = [str(i.terms) for i in lda_terms]

    # based on model terms choose news stories
    for term_list in lda_terms_list:
        # term_list is the printed form of a Python list, e.g. "['a', 'b']":
        # strip the brackets, split on commas, take the text between quotes.
        topic_words = term_list[1:-1].split(",")
        s = [
            r"(^|\W)" + str(term.split("'")[1]) + r"($|\W)"
            for term in topic_words
        ]
        # One alternation that matches any topic term as a whole word.
        rx = '|'.join('(?:{0})'.format(x.strip()) for x in s)

        df_results = df_news_content_ascii.filter(
            df_news_content_ascii['news_content_ascii'].rlike(rx))
        df_results = df_results.withColumn("topic_words",
                                           fn.lit(str(topic_words)[1:-1]))
        df_results = df_results.withColumn("results_date",
                                           fn.lit(datetime.datetime.now()))

        # insert results with the raw data to the cassandra table
        df_results.select("id", "news_provider", "published", "summary", "title", "topic_words", "results_date") \
            .write \
            .format("org.apache.spark.sql.cassandra") \
            .mode("append") \
            .options(table="travel_news_data_results", keyspace="news_stream_analysis") \
            .save()
def train_cv_model(modelDataframe):
    """Fit a binary CountVectorizer on ``udf_results`` and save the model.

    Terms must occur in at least 5 rows (``minDF=5.0``). The fitted model
    is written to ``models/cvModel``, replacing any model already there.

    Args:
        modelDataframe: DataFrame with a ``udf_results`` token-array column.
    """
    vectorizer = CountVectorizer(
        inputCol="udf_results",
        outputCol="features",
        binary=True,
        minDF=5.0,
    )
    fitted_vectorizer = vectorizer.fit(modelDataframe)

    model_writer = fitted_vectorizer.write().overwrite()
    model_writer.save("models/cvModel")
# Add the character length of each message as an extra numeric feature.
data = data.withColumn('length', length(data['text']))
data.show()


# In[3]:


# Compare the length difference between ham and spam
data.groupby('class').mean().show()


# In[4]:


# Build TF-IDF features for each text
# TF: Term Frequency
# IDF: Inverse Document Frequency
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler

# text -> tokens -> tokens without stop words -> term counts -> TF-IDF
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
# Encode the string 'class' column as a numeric 'label' column.
ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label')
# Final feature vector = TF-IDF vector followed by the message length.
final_feature = VectorAssembler(inputCols=['tf_idf', 'length'],outputCol='features')

from pyspark.ml import Pipeline

data_prep_pipe = Pipeline(stages=[ham_spam_to_num,tokenizer,stopremove,count_vec,idf,final_feature])
clean_data = data_prep_pipe.fit(data).transform(data)

clean_data.show()
clean_data.take(1)
# Last column of the first row, i.e. the assembled 'features' vector.
clean_data.take(1)[0][-1]


# In[4*]:


# Select the features column and transform it to a Pandas dataframe
df = clean_data.select('features').toPandas()
def main(context):
    """Score Reddit comments with saved sentiment models and export statistics.

    Fits a binary CountVectorizer on the n-grams of the labeled comments,
    loads the saved positive/negative ``CrossValidatorModel``s from ``www/``,
    applies them to a sample of all comments and writes aggregate CSV reports:
    overall percentages, by date, by state, by comment score and by story
    score.

    ``sampleRate`` and ``states`` are read from module scope.

    Args:
        context: Spark SQL context used for every read in this function.
    """
    def write_csv(frame, path):
        """Write ``frame`` to ``path`` as a one-partition CSV with a header."""
        frame.repartition(1).write.format("com.databricks.spark.csv").\
            option("header", "true").save(path)

    # the read is from the parquet file
    comments = context.read.parquet("comments-minimal.parquet")
    submissions = context.read.parquet("submissions.parquet")

    # only look at columns that are useful
    comments = comments.select("id", "created_utc", "body", "author_flair_text", "link_id", "score").\
        withColumnRenamed("score", "commentscore")
    submissions = submissions.select("id", "title", "score").\
        withColumnRenamed("score", "storyscore")

    #comments.write.parquet("comments-minimal.parquet")
    #submissions.write.parquet("submissions.parquet")

    labeled_data = context.read.format("csv").options(header='true', inferSchema='true').load('labeled_data.csv')

    #here we do the join on comment id
    joined = comments.join(labeled_data, comments.id == labeled_data.Input_id)

    #sanitize_new ignores processed string given by sanitize
    from cleantext import sanitize

    def sanitize_new(text):
        """Return one flat token list from the last three outputs of sanitize."""
        r = sanitize(text)[1:]
        return r[0].split(" ") + r[1].split(" ") + r[2].split(" ")

    #create the udf, generate new column of n-grams
    sanitize_udf = udf(sanitize_new, ArrayType(StringType()))
    joined = joined.withColumn("ngrams", sanitize_udf(joined.body))

    #6a: construct feature vector based on "ngrams"
    #store the transformed column in "features"
    #CountVectorizer produces sparse vector by default so no need to change
    cv = CountVectorizer(inputCol="ngrams", outputCol="features", minDF=5.0, binary=True)
    cv_model = cv.fit(joined)
    joined = cv_model.transform(joined)

    #6b: construct pos column and neg column
    #for this project, only look at label on Trump
    pos_udf = udf(lambda label: 1 if label == 1 else 0, IntegerType())
    neg_udf = udf(lambda label: 1 if label == -1 else 0, IntegerType())
    joined = joined.withColumn("poslabel", pos_udf(joined.labeldjt))
    joined = joined.withColumn("neglabel", neg_udf(joined.labeldjt))

    #7: train logistic regression model
    #code adopted from project spec
    #(kept commented out: the trained models are loaded from disk below)
    # #Initialize two logistic regression models.
    # poslr = LogisticRegression(labelCol="poslabel", featuresCol="features", maxIter=10)
    # neglr = LogisticRegression(labelCol="neglabel", featuresCol="features", maxIter=10)
    # poslr.setThreshold(0.2)
    # neglr.setThreshold(0.25)
    # # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    # posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
    # negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
    # # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # # We will assume the parameter is 1.0. Grid search takes forever.
    # posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    # negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # # We initialize a 5 fold cross-validation pipeline.
    # posCrossval = CrossValidator(
    #     estimator=poslr,
    #     evaluator=posEvaluator,
    #     estimatorParamMaps=posParamGrid,
    #     numFolds=5)
    # negCrossval = CrossValidator(
    #     estimator=neglr,
    #     evaluator=negEvaluator,
    #     estimatorParamMaps=negParamGrid,
    #     numFolds=5)
    # # Although crossvalidation creates its own train/test sets for
    # # tuning, we still need a labeled test set, because it is not
    # # accessible from the crossvalidator (argh!)
    # # Split the data 50/50
    # posTrain, posTest = joined.randomSplit([0.5, 0.5])
    # negTrain, negTest = joined.randomSplit([0.5, 0.5])
    # # Train the models
    # print("Training positive classifier...")
    # posModel = posCrossval.fit(posTrain)
    # print("Training negative classifier...")
    # negModel = negCrossval.fit(negTrain)
    # # save the models
    # posModel.save("www/pos.model")
    # negModel.save("www/neg.model")

    #load instead
    posModel = CrossValidatorModel.load("www/pos.model")
    negModel = CrossValidatorModel.load("www/neg.model")
    print("finished loading model")

    #8.2 title of submission of the comment
    comments = comments.withColumn("clean_id", regexp_replace("link_id", r'^t3_', ''))
    comments = comments.join(submissions, comments.clean_id == submissions.id).drop(submissions.id)

    #9
    #filter out comments with "\s" and starts with ">"
    comments = comments.filter(~comments.body.rlike(r'^>')).\
        filter(~comments.body.rlike(r'\\s'))

    #sample
    # NOTE: the seed passed is None, so the sample differs between runs.
    comments = comments.sample(False, sampleRate, None)

    #redo 4,5,6a
    comments = comments.withColumn("ngrams", sanitize_udf(comments.body))
    comments = cv_model.transform(comments)
    print("done with transforming the sampled comments")

    #make predictions
    comments = posModel.transform(comments).\
        drop("body", "link_id", "clean_id", "ngrams", "rawPrediction", "probability").\
        withColumnRenamed("prediction", "poslabel")
    comments = negModel.transform(comments).drop("features", "rawPrediction", "probability").\
        withColumnRenamed("prediction", "neglabel")

    #10
    #1. compute the percentage of positive, negative comments
    print("Percentage of positive comments")
    result = comments.select('poslabel').groupBy().avg()
    write_csv(result, "pos-perc.csv")

    print("Percenetage of negative comments")
    result = comments.select('neglabel').groupBy().avg()
    write_csv(result, "neg-perc.csv")

    #2. by date
    # 'yyyy' is the calendar year. The previous 'YYYY' is the week-based year,
    # which yields the wrong year for days around New Year and is rejected by
    # Spark 3's datetime formatter.
    comments = comments.withColumn("date", from_unixtime(comments.created_utc, "yyyy-MM-dd"))
    result = comments.groupBy("date").agg({"poslabel": "mean", "neglabel": "mean"})
    write_csv(result, "time_data.csv")

    #3. by state
    # Keep only comments whose lower-cased flair is one of the known states.
    val_state_udf = udf(lambda state: state if state in states else None, StringType())
    comments = comments.withColumn("state", val_state_udf(lower(comments.author_flair_text)))
    comments = comments.filter(comments.state.isNotNull())
    result = comments.groupBy("state").agg({"poslabel": "mean", "neglabel": "mean"})
    result.show(truncate=False)
    print(result.count())
    write_csv(result, "state_data.csv")

    #4a. by comment score
    result = comments.groupBy("commentscore").agg({"poslabel": "mean", "neglabel": "mean"})
    write_csv(result, "comment_score.csv")

    #4b. by story score
    result = comments.groupBy("storyscore").agg({"poslabel": "mean", "neglabel": "mean"})
    write_csv(result, "story_score.csv")
def main(inputs):
    """Fit an LDA topic model on review headlines and print the top terms.

    Reads the parquet data at ``inputs``, tokenizes the ``review_headline``
    column, builds TF-IDF features (vocabulary capped at 5000 terms that occur
    in at least 10 headlines) and fits a 10-topic LDA model. The 5 highest
    weighted term indices per topic are shown on stdout.

    Args:
        inputs: Path of the parquet data set to read.
    """
    input_df = spark.read.parquet(inputs)
    input_df = input_df.repartition(96)

    # A set makes the per-word stop-word test O(1) instead of a list scan.
    stop_words = set(stopwords.words("english"))

    def tokenize(headline):
        """Lower-case and split a headline; keep alphabetic, non-stop words longer than 3 chars."""
        words = headline.strip().lower().split(" ")
        return [
            w for w in words
            if w.isalpha() and len(w) > 3 and w not in stop_words
        ]

    # Headlines that are None are dropped before tokenizing; zipWithIndex
    # attaches a running document id to every token list.
    tokens = input_df.rdd.map(lambda row: row['review_headline'])\
        .filter(lambda headline: headline is not None)\
        .map(tokenize)\
        .zipWithIndex()

    df_txts = spark.createDataFrame(tokens, ["list_of_words", 'index'])

    # TF
    cv = CountVectorizer(inputCol="list_of_words",
                         outputCol="raw_features",
                         vocabSize=5000,
                         minDF=10.0)
    cvmodel = cv.fit(df_txts)
    result_cv = cvmodel.transform(df_txts)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv)

    num_topics = 10
    max_iterations = 100
    lda = LDA(k=num_topics, maxIter=max_iterations)
    lda_model = lda.fit(result_tfidf.select('index', 'features'))

    wordNumbers = 5
    topics = lda_model.describeTopics(maxTermsPerTopic=wordNumbers)
    topics.show(truncate=False)
def main():
    """Command-line entry point of the bigram TF-IDF retrieval pipeline.

    The ``--mode`` argument selects one of three stages:

    * ``prepro`` - read the JSON documents, parse and tokenize them, build
      unigram + bigram tokens and write them to ``<prepro_path>/docs_bigrams``.
    * ``fit`` - fit the TF and IDF models on those tokens, save the combined
      pipeline model, the per-document IDF vectors and a (doc_id, token_id)
      inverted index.
    * ``query`` - vectorize the queries with the saved model, keep for each
      query the ``--inverted_index_limit`` documents that share the most
      tokens with it, score the pairs with ``Dot`` and write the result to
      ``<output_path>/query_results``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--docs_path', default='data/wiki-sample/AA')
    parser.add_argument('-p', '--prepro_path', default='data/prepro')
    parser.add_argument('-q',
                        '--queries_path',
                        default='data/queries/sample.json')
    parser.add_argument('-o', '--output_path', default='data/output')
    parser.add_argument('-m',
                        '--mode',
                        choices=['prepro', 'fit', 'query'],
                        default='prepro')
    parser.add_argument('-dl', '--docs_limit', type=int)
    parser.add_argument('-ql', '--queries_limit', type=int)
    parser.add_argument('-il',
                        '--inverted_index_limit',
                        type=int,
                        default=5000)
    args = parser.parse_args()

    print('Running BigramPipeline with args: {}'.format(args))

    spark = SparkSession.builder.appName('BigramModel').getOrCreate()

    # Extracts the index array of a vector (the ids of its stored entries).
    tokenIdsUdf = udf(lambda x: x.indices.tolist(), ArrayType(IntegerType()))

    # Artifacts exchanged between the 'prepro', 'fit' and 'query' stages.
    tfIdfModelPath = os.path.join(args.prepro_path, 'tf_idf_model')
    docsTfIdfPath = os.path.join(args.prepro_path, 'docs_tf_idf')
    docsTokenIdsPath = os.path.join(args.prepro_path, 'docs_token_ids')
    docsBigramsPath = os.path.join(args.prepro_path, 'docs_bigrams')

    # NOTE: 'parser' is rebound here from the argparse parser (no longer
    # needed once the arguments are parsed) to the wiki text parser stage.
    parser = WikiParser(inputCol='text',
                        outputCol='text_parsed',
                        minParagraphs=1,
                        minCharacters=500)
    tokenizer = Tokenizer(inputCol='text_parsed', outputCol='unigrams')
    ngrams = NGram(inputCol='unigrams', outputCol='bigrams', n=2)
    # 'tokens' combines the unigram and bigram columns.
    concat = Concat(inputCols=['unigrams', 'bigrams'], outputCol='tokens')

    if args.mode == 'prepro':
        spark.sparkContext.setJobGroup('input', 'Read input data')
        docs = spark.read.json(args.docs_path)
        if args.docs_limit is not None:
            docs = docs.limit(args.docs_limit)

        spark.sparkContext.setJobGroup('parse_docs', 'Parse wiki documents')
        docsParsed = parser.transform(docs)
        docsParsed = checkpoint(spark, docsParsed,
                                os.path.join(args.prepro_path, 'docs_parsed'))

        spark.sparkContext.setJobGroup('tokenize', 'Tokenize documents')
        docsTokenized = tokenizer.transform(docsParsed)
        docsTokenized = checkpoint(
            spark, docsTokenized,
            os.path.join(args.prepro_path, 'docs_tokenized'))

        spark.sparkContext.setJobGroup('ngrams', 'Compute bigrams')
        docsBigrams = ngrams.transform(docsTokenized)
        docsBigrams = concat.transform(docsBigrams)
        docsBigrams.write.parquet(docsBigramsPath)
    elif args.mode == 'fit':
        spark.sparkContext.setJobGroup('input', 'Read input data')
        docsBigrams = spark.read.parquet(docsBigramsPath).select(
            'id', 'tokens')

        # minDF / minTF thresholds filter out rare terms (see the
        # CountVectorizer documentation for their exact semantics).
        tf = CountVectorizer(inputCol='tokens',
                             outputCol='tf',
                             vocabSize=10000000,
                             minDF=2.0,
                             minTF=3.0)
        idf = IDF(inputCol='tf', outputCol='idf')

        spark.sparkContext.setJobGroup('tf', 'Fit TF model')
        tfModel = tf.fit(docsBigrams)
        docsTf = tfModel.transform(docsBigrams)
        docsTf = checkpoint(spark, docsTf,
                            os.path.join(args.prepro_path, 'docs_tf'))

        spark.sparkContext.setJobGroup('idf', 'Fit IDF model')
        idfModel = idf.fit(docsTf)
        docsTfIdf = idfModel.transform(docsTf)
        docsTfIdf = docsTfIdf.select(docsTfIdf.id.alias('doc_id'),
                                     docsTfIdf.idf.alias('doc_idf'))
        docsTfIdf = checkpoint(spark, docsTfIdf, docsTfIdfPath)

        # Saved model starts at the tokenizer, so its input must already
        # carry a 'text_parsed' column (the wiki parser is not included).
        tfIdfModel = PipelineModel(
            stages=[tokenizer, ngrams, concat, tfModel, idfModel])
        tfIdfModel.save(tfIdfModelPath)

        spark.sparkContext.setJobGroup('docs_token_ids',
                                       'Compute inverted index')
        # One (doc_id, token_id) row per stored entry of the document vector.
        docsTokenIds = docsTfIdf.select(
            docsTfIdf.doc_id,
            explode(tokenIdsUdf(docsTfIdf.doc_idf)).alias('token_id'))
        docsTokenIds.write.parquet(docsTokenIdsPath)
    elif args.mode == 'query':
        # NOTE: --queries_path has a default, so this only fails if the
        # default is ever removed.
        assert args.queries_path is not None

        spark.sparkContext.setJobGroup('input', 'Read input data')
        tfIdfModel = PipelineModel.load(tfIdfModelPath)
        docsTfIdf = spark.read.parquet(docsTfIdfPath)
        docsTokenIds = spark.read.parquet(docsTokenIdsPath)
        queries = spark.read.json(args.queries_path)
        if args.queries_limit is not None:
            queries = queries.limit(args.queries_limit)
        # The question text is fed in as 'text_parsed', the column the saved
        # model's tokenizer reads.
        queries = queries.select(queries._id.alias('query_id'),
                                 queries.question.alias('text_parsed'))

        spark.sparkContext.setJobGroup('queries_tf_idf',
                                       'Apply TF-IDF to queries')
        queriesTfIdf = tfIdfModel.transform(queries)
        # Only the raw term-frequency vector of the query is kept.
        queriesTfIdf = queriesTfIdf.select(queriesTfIdf.query_id,
                                           queriesTfIdf.tf.alias('query_tf'))
        queriesTfIdf = checkpoint(
            spark, queriesTfIdf,
            os.path.join(args.output_path, 'queries_tf_idf'))
        print('Finished query TF IDF')

        spark.sparkContext.setJobGroup('queries_token_ids',
                                       'Compute query token IDs')
        queriesTokenIds = queriesTfIdf.select(
            queriesTfIdf.query_id,
            explode(tokenIdsUdf(queriesTfIdf.query_tf)).alias('token_id'))
        queriesTokenIds = checkpoint(
            spark, queriesTokenIds,
            os.path.join(args.output_path, 'queries_token_ids'))
        print('Finished query token IDs')

        spark.sparkContext.setJobGroup('doc_queries',
                                       'Perform inverted index filtering')
        # Count shared tokens per (query, doc) pair ...
        docQueries = docsTokenIds.join(queriesTokenIds,
                                       on='token_id').groupby(
                                           'query_id', 'doc_id').count()
        # ... and keep, per query, the docs with the highest counts.
        window = Window.partitionBy(docQueries.query_id).orderBy(
            col('count').desc())
        docQueries = docQueries.withColumn('rank', row_number().over(window)) \
            .filter(col('rank') <= args.inverted_index_limit) \
            .select('query_id', 'doc_id')
        docQueries = checkpoint(spark, docQueries,
                                os.path.join(args.output_path, 'doc_queries'))
        print('Finished inverted index filter')

        spark.sparkContext.setJobGroup('score', 'Perform scoring')
        docQueries = docQueries.join(docsTfIdf, on='doc_id').join(queriesTfIdf, on='query_id') \
            .select('query_id', 'doc_id', 'query_tf', 'doc_idf')
        # Score of a pair is produced by Dot from 'doc_idf' and 'query_tf'.
        docQueries = Dot(inputCols=['doc_idf', 'query_tf'],
                         outputCol='score').transform(docQueries)
        queryResults = docQueries.select('query_id', 'doc_id', 'score')
        queryResults.write.parquet(
            os.path.join(args.output_path, 'query_results'))
        print('Wrote output to {}'.format(args.output_path))

    spark.stop()
# Paths to the training and test data (same directory as the code)
pathTrain = ["dataset_train.csv"]
pathTest = ["dataset_test.csv"]

# Prepare the training and test dataframes
data_treino = spark.read.load(pathTrain, format="csv", header=True)  # training dataset
data_test = spark.read.load(pathTest, format="csv", header=True)  # test dataset
print(" Dados de treino")
data_treino.select("*").show()

# Tokenization, stop-word removal, vocabulary (term counts) and IDF stages
tk = Tokenizer(inputCol="Conteudo", outputCol="tokens")
swr = StopWordsRemover(inputCol="tokens", outputCol="words")
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=100000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Fit the preprocessing pipeline ONCE, on the training data, and apply that
# same fitted model to both sets. Re-fitting on the test set would give it its
# own vocabulary and IDF weights, so a vector index would mean a different
# word in train and test, and hashes from the MinHash model fitted below could
# not be meaningfully compared across the two sets.
# NOTE: a test row containing no word from the training vocabulary now gets an
# all-zero feature vector - confirm that downstream code handles such rows.
pipeline = Pipeline(stages=[tk, swr, cv, idf])
model_pipe = pipeline.fit(data_treino)
data_treino = model_pipe.transform(data_treino)
data_test = model_pipe.transform(data_test)

# Build the MinHash LSH model on the training features and hash the training set
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(data_treino)
data_treino = model.transform(data_treino)
.select(F.col('date').cast('date'), 'note', F.col('duration').cast('int')) maintenance.show(5, truncate=False) # ### Sample of 2-word nGrams on Maintenance Notes tk = Tokenizer(inputCol="note", outputCol="words") # Tokenize maintTokenized = tk.transform(maintenance) swr = StopWordsRemover(inputCol="words", outputCol="filtered") # Remove stop-words maintFiltered = swr.transform(maintTokenized) ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams") # 2-word nGrams maintNGrams = ngram.transform(maintFiltered) maintNGrams.select('ngrams').show(5, truncate=False) # ### Topic Clustering using Latent Dirichlet Allocation (LDA) # LDA is a form of un-supervised machine learning that identifies clusters, or topics, # in the data cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=50)\ .fit(maintNGrams) # CountVectorize converts nGram array into a vector of counts maintVectors = cv.transform(maintNGrams) vocabArray = cv.vocabulary lda = LDA(k=3, maxIter=10) ldaModel = lda.fit(maintVectors) ldaModel.write().overwrite().save('lda.mdl') topics = ldaModel.describeTopics(5) # We see below that each maintenance log can be clustered based on its text into # 1 of 3 topics below. The nGrams in each cluster show clearly 3 types of maintenance # activities # 1. Preventive maintenance occurs when the we have 'abnormal readings' or a 'component replacement' # 2. Corrective maintenance occurs when we have a 'asset shutdown' event or 'asset failure' # 3. The rest of the logs indicate that no downtime is required (ie. 'maintenance tests passed', 'asset healthy') for topic in topics.collect():
# limitations under the License.
#

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer
# $example off$

if __name__ == "__main__":
    spark = (
        SparkSession.builder
        .appName("CountVectorizerExample")
        .getOrCreate()
    )

    # $example on$
    # Toy corpus: every row pairs a document ID with its bag of words.
    rows = [
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ]
    df = spark.createDataFrame(rows, ["id", "words"])

    # Learn a CountVectorizerModel (the vocabulary) from the corpus, keeping
    # at most 3 terms that each occur in at least 2 documents.
    cv = CountVectorizer(
        inputCol="words",
        outputCol="features",
        vocabSize=3,
        minDF=2.0,
    )
    model = cv.fit(df)

    # Turn each bag of words into its vector of term counts and print it.
    result = model.transform(df)
    result.show()
    # $example off$

    spark.stop()
param: Takes in ''' text = record[3] # The 3rd column corresponds to the review text words = text.split() return words udf_formattext = udf(cleanup_text_format, ArrayType(StringType())) clean_text = reviews.withColumn( "reviewTextArray", udf_formattext(struct([reviews[x] for x in reviews.columns]))) # Count Vectorizor Convert a collection of text documents to vectors of token counts cv = CountVectorizer(inputCol="reviewTextArray", outputCol="rawFeatures", vocabSize=1000) cvmodel = cv.fit(clean_text) featurizedData = cvmodel.transform(clean_text) vocab = cvmodel.vocabulary vocab_broadcast = sc.broadcast(vocab) def map_termID_to_Word(termIndices): ''' Map each term index back to its original word param (list of integers): Each element correponds to a word represented by an integer returns (list of str): Returns the words which are converted from their respective strings ''' return [vocab_broadcast.value[termID] for termID in termIndices]