# You_Tube_Comments_Analysis.py
# 0. Data exploration and cleaning
from pyspark.sql.functions import col, when

# Load the raw comments CSV; the first row supplies the column names and the
# column types are inferred from the data.
df_clean = spark.read.csv(
    "/FileStore/tables/animals_comments.csv",
    header=True,
    inferSchema=True,
)
df_clean.show(3)
# Row count before cleaning (the value is only displayed in an interactive
# / notebook session; as a plain script statement it is discarded).
df_clean.count()

# Rows without a comment cannot be labelled or tokenized, so drop them.
df_clean = df_clean.na.drop(subset=["comment"])
# Row count after cleaning.
df_clean.count()

# Explore the data
df_clean.show()

# Label the data: find users who state that they own a dog or a cat.
# A comment gets label 1 when it matches at least one of the SQL LIKE
# patterns below and label 0 otherwise. You can use your own way to extract
# the label.
owner_patterns = [
    "%my dog%",
    "%I have a dog%",
    "%my cat%",
    "%I have a cat%",
    "%my puppy%",
    "%my pup%",
    "%my kitty%",
    "%my pussy%",
]

# OR all pattern matches together into a single boolean column expression.
mentions_pet = col("comment").like(owner_patterns[0])
for pattern in owner_patterns[1:]:
    mentions_pet = mentions_pet | col("comment").like(pattern)

df_clean = df_clean.withColumn("label", when(mentions_pet, 1).otherwise(0))
df_clean.show()
# 1. Data preprocessing and building the classifier input
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import RegexTokenizer, Word2Vec
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import BooleanType

# Regular-expression tokenizer: "comment" -> "words", using the pattern \W.
regexTokenizer = RegexTokenizer(inputCol="comment", outputCol="words", pattern="\\W")
# Word2Vec turns each token list into the "features" vector used by the models.
word2Vec = Word2Vec(inputCol="words", outputCol="features")
pipeline = Pipeline(stages=[regexTokenizer, word2Vec])

# Fit the feature pipeline on all cleaned comments and apply it.
# NOTE(review): Word2Vec is fitted before the train/test split, so the
# embedding also sees the comments that later end up in the test set.
pipelineFit = pipeline.fit(df_clean)
dataset = pipelineFit.transform(df_clean)
dataset.show()

# Remove rows whose token list is empty (caused by non-English statements,
# per the original author). The built-in `size` is used instead of a Python
# UDF so the check is evaluated inside Spark without a Python round trip.
dataset_noEmpty = dataset.filter(size(col('words')) > 0)
dataset_noEmpty.show()

# Build the training and test sets.
# Positive comments (label == 1) are split 70/30. Negative comments
# (label == 0) are down-sampled: 0.5% of them go to training, and 0.2% of the
# remainder go to testing. Fixed seeds keep the split reproducible.
(label1_train, label1_test) = dataset_noEmpty.filter(col('label') == 1).randomSplit([0.7, 0.3], seed=100)
(label0_train, label0_rest) = dataset_noEmpty.filter(col('label') == 0).randomSplit([0.005, 0.995], seed=100)
(label0_test, _label0_unused) = label0_rest.randomSplit([0.002, 0.998], seed=100)

trainingData = label1_train.union(label0_train)
testData = label1_test.union(label0_test)

print("Dataset Count: " + str(dataset.count()))
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))
# Build the ML model: logistic regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# --- Baseline model -------------------------------------------------------
lr = LogisticRegression(maxIter=10, regParam=0.3)
lrModel = lr.fit(trainingData)

# Look at predictions on the training set only, because we do not want to
# touch the test samples yet. Cross-validation and grid-search based
# fine-tuning are applied below.
predictions = lrModel.transform(trainingData)
predictions.select('comment', 'features', 'rawPrediction', 'probability', 'prediction', 'label').show(10)

# Evaluate the baseline with AUC. The original author reported 0.945 here,
# which is good but may also be due to overfitting, hence the
# cross-validation that follows.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('AUC value on training samples: ' + '%.3f' % evaluator.evaluate(predictions))

# --- Hyper-parameter tuning with K-fold cross-validation -------------------
# Note: the choice of hyper-parameters is not optimal, especially maxIter,
# owing to running-time concerns.
lr = LogisticRegression()
# Grid over regParam and maxIter.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.5, 2])
             .addGrid(lr.maxIter, [1, 2, 5])
             .build())

# 3-fold CrossValidator scored with the evaluator above (AUC).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)
cvModel_lr = cv.fit(trainingData)

# Use the test set to measure the best model on new data.
predictions = cvModel_lr.transform(testData)
print('AUC value of best Logistic Regression model on test samples: ' + '%.3f' % evaluator.evaluate(predictions))

# Display the best hyper-parameters. The second line previously carried the
# label 'Best regParam' although it prints maxIter.
bestModel_lr = cvModel_lr.bestModel
print('Best regParam: ' + '%.2f' % bestModel_lr._java_obj.getRegParam())
print('Best maxIter: ' + str(bestModel_lr._java_obj.getMaxIter()))
# Try a random forest model
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import RandomForestClassifier

# Grid over maxDepth, maxBins and numTrees.
rf = RandomForestClassifier()
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 10])
             .build())

# 3-fold CrossValidator scored with a BinaryClassificationEvaluator (AUC).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)
cvModel_rf = cv.fit(trainingData)

# Use the test set to measure the best model on new data.
predictions = cvModel_rf.transform(testData)
print('AUC value of best RandomForest model on test samples: ' + '%.3f' % evaluator.evaluate(predictions))

# Display the best hyper-parameters. maxDepth is an integer setting, so it is
# printed with str() like maxBins and numTrees rather than as a '%.2f' float.
bestModel_rf = cvModel_rf.bestModel
print('Best maxDepth: ' + str(bestModel_rf._java_obj.getMaxDepth()))
print('Best maxBins: ' + str(bestModel_rf._java_obj.getMaxBins()))
print('Best numTrees: ' + str(bestModel_rf._java_obj.getNumTrees()))
# Try GDBT (gradient-boosted trees)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import GBTClassifier

# Grid over maxDepth, maxBins and maxIter.
gdbt = GBTClassifier()
paramGrid = (ParamGridBuilder()
             .addGrid(gdbt.maxDepth, [2, 4])
             .addGrid(gdbt.maxBins, [20, 60])
             .addGrid(gdbt.maxIter, [5, 10])
             .build())

# 3-fold CrossValidator scored with a BinaryClassificationEvaluator (AUC).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
cv = CrossValidator(estimator=gdbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

# Run the cross-validation once. The script previously repeated this whole
# section a second time, which re-ran the same grid search on the same data.
cvModel_gdbt = cv.fit(trainingData)

# Use the test set to measure the best model on new data.
predictions = cvModel_gdbt.transform(testData)
print('AUC value of best GDBT model on test samples: ' + '%.3f' % evaluator.evaluate(predictions))

# Display best hyper-parameters.
# NOTE(review): these prints were disabled in the original script and are
# left disabled; presumably the getters were not available on the fitted
# model in the author's environment. Confirm before re-enabling.
#print('Best maxDepth: ' + str(cvModel_gdbt.bestModel._java_obj.getMaxDepth()))
#print('Best maxBins: ' + str(cvModel_gdbt.bestModel._java_obj.getMaxBins()))
#print('Best maxIter: ' + str(cvModel_gdbt.bestModel._java_obj.getMaxIter()))
bestModel_gdbt = cvModel_gdbt.bestModel
# Get the best model with the best hyper-parameters.
# According to the AUC result on test samples, the original author selected
# GDBT (reported as maxDepth=4, maxBins=20 and maxIter=10) as the best model.
best_model = bestModel_gdbt

# Apply the best model
# 2. Classify all the users
from pyspark.sql import functions as F

# Predict over all comments.
predictions_over_comments = best_model.transform(dataset_noEmpty)

# Predict over all users. A user with more than one comment has more than one
# prediction. We want to find potential buyers and do not want to miss any
# candidate, so a "max-win" rule is applied: a user is marked as 1 unless all
# of their comment-level predictions are 0.
predictions_over_users = (
    predictions_over_comments
    .groupBy('userid')
    .agg(F.max('prediction').alias('predictions_over_users'))
)
predictions_over_users.show(5)

# Display the percentage of cat or dog owners.
# The formatted message was disabled in the original because its format
# string contained an unescaped '%'; '%%' emits a literal percent sign.
total_users = predictions_over_users.count()
owner_users = predictions_over_users.filter(F.col('predictions_over_users') == 1).count()
# 100.0 forces float division; guard against an empty result set.
owner_pct = 100.0 * owner_users / total_users if total_users else 0.0
print('%.2f%% of users are cat or dog owner.' % owner_pct)
# Investigate the reasons from the text
# 3. Get insight into the users

# First, select the comments written by predicted cat or dog owners: keep the
# users whose aggregated prediction is 1 and join them back to their comments.
predicted_owners = predictions_over_users.filter(F.col('predictions_over_users') == 1)
owner_comments = predicted_owners.join(predictions_over_comments, on=['userid'])
cat_dog_owner = owner_comments.select(
    'userid',
    'comment',
    'words',
    'predictions_over_users',
    'creator_name',
)

# Second, find the top 10 popular words in cat and dog owners' comments.
# Common words, such as 'and', 'i', 'you' and 'we', are excluded.
common_words = """
    i the and a to you is it of my
    that in so for have this your are
    was on with but he they be me
    just do all one not what im if
    get when them its she would can
    her at or how as up out him
    dont we from about will see his
    great there know had really people
    because much an lol got more some
    want no think videos has very now
    u go too day these who little
    did by their could make been hope
    3 should also am always why keep
    were well those then going never
    thats cant only new way other look
    need please take first
""".split()

# One row per (comment, word), minus the common words, counted per word and
# ordered from most to least frequent.
exploded_words = cat_dog_owner.withColumn('word', F.explode(F.col('words')))
informative_words = exploded_words.filter(~F.col('word').isin(common_words))
word_counts = informative_words.groupBy('word').count()
popular_words = word_counts.sort('count', ascending=False)
popular_words.show(10)

# 4. Identify creators with cat and dog owners in the text
# Display the top 10 creators with the largest number of comments from
# predicted cat and dog owners.
comments_per_creator = cat_dog_owner.groupBy('creator_name').count()
creators = comments_per_creator.sort('count', ascending=False)
creators.show(10)
# 5. Analysis and Future work
#In this project, we aim to build a model that identifies cat or dog owners from the comments on YouTube videos related to animals or pets, and we also try to find out which topics interest them most.
#In total, we have more than 5 million samples. We first remove the samples with no comment or with non-English comments, and we label a comment based on whether it contains a phrase such as 'I have a dog' or 'my dog'.
#Next, we fine-tune and select a model among logistic regression, random forest, and gradient boosting using cross-validation according to the area under the ROC curve (AUC). Finally, gradient boosting provides the best AUC value (i.e., 0.939). With the selected model, we classify all the users, extract insights about cat and dog owners, and find the topics important to them.
#In future work, we can further optimize the model when more computational resources are available.