-
Notifications
You must be signed in to change notification settings - Fork 4
/
youtube_history.py
executable file
·374 lines (328 loc) · 14.1 KB
/
youtube_history.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Downloads, analyzes, and reports all Youtube videos associated with a user's Google account.
"""
import json
import os
import pickle
import argparse
import getpass
import subprocess as sp
import sys
from collections import namedtuple
from pathlib import Path
from webbrowser import open_new_tab
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from flask import Flask
from flask import render_template
from bs4 import BeautifulSoup
from emoji import emoji_lis
from grapher import Grapher, flatten_without_nones
# Prompt shown before the legacy youtube-dl login flow. This text is printed
# to the user at runtime (via input()), so it is kept verbatim.
DEPRECATION_NOTE = """
This method of downloading data is deprecated.
It uses youtube-dl to login to your Google account.
This is error-prone, as Google may think you are a bot.
Instead, you should go to https://takeout.google.com/,
and follow directions there to download your "YouTube and YouTube Music" data.
Then you can re-run this program specifying the `--takeout` flag,
pointing to the *unzipped* directory you downloaded from Google.
Do you want to continue anyway? [y/n]:
"""
# Flask application that serves the rendered analysis (see launch_web()).
app = Flask(__name__)
@app.route('/', methods=['GET', 'POST'])
def index():
    # Render the report against the module-level `analysis` object that the
    # __main__ block populates before launch_web() starts the server.
    return render_template('index.html', analysis=analysis)
def launch_web():
    """Serve the analysis report on the local Flask development server.

    If at least one video's metadata was downloaded, a browser tab pointing
    at the local server is opened before the (blocking) server starts.
    """
    app.debug = False
    app.secret_key = 'this key should be complex'
    first_record = os.path.join(analysis.raw, '00001.info.json')
    if os.path.isfile(first_record):
        open_new_tab('http://127.0.0.1:5000')
    app.run()
def make_fake_series(title='N/A', webpage_url='N/A', **kwargs):
    """Build a lightweight stand-in for a pandas Series row.

    Returns a namedtuple exposing ``title``, ``webpage_url`` and any extra
    keyword fields, used by the report when no real video qualifies.
    """
    field_names = ['title', 'webpage_url', *kwargs]
    placeholder_cls = namedtuple('MockSeries', field_names)
    return placeholder_cls(title=title, webpage_url=webpage_url, **kwargs)
class Analysis:
    """Main class responsible for downloading and analyzing data.

    Parameters
    ----------
    takeout : Optional[str]
        Path to an unzipped Takeout folder downloaded from https://takeout.google.com/
    outpath : str (default='data')
        The path to the directory where both raw and computed results should be stored.
    delay : float (default=0)
        Amount of time in seconds to wait between requests.

    Attributes
    ----------
    raw : str
        Path to 'raw' directory in self.path directory
    ran : str
        Path to 'ran' directory in self.path directory
    df : DataFrame
        Pandas Dataframe used to store compiled results
    tags : [[str]]
        A list of tags for each downloaded video
    grapher : Grapher
        Creates the interactive graphs portion of the analysis
    seconds : int
        The sum of video durations
    formatted_time : str
        Seconds converted to W/D/H/M/S format
    most_viewed : Series
        Video with the most total views
    least_viewed : DataFrame
        Collection of at most 10 videos with single digit views
    best_per_decile : DataFrame
        10 videos, one per view_count decile, where each video has the highest average rating in that decile
    worst_per_decile : DataFrame
        Same as best_per_decile, but lowest average rating
    emojis : Series
        Video with the most unique emojis in the description
    oldest_videos : DataFrame
        First 10 videos watched on user's account.
    oldest_upload : Series
        Video with the oldest upload date to youtube.
    HD : int
        The number of videos that have high-definition resolution
    UHD : int
        The number of videos that have ultra-high-definition resolution
    top_uploaders : Series
        The most watched channel names with corresponding video counts
    funny_counts : int
        The max number of times a video's description says the word 'funny'
    funny : Series
        The 'funniest' video as determined by funny_counts
    """

    def __init__(self, takeout=None, outpath='data', delay=0):
        # BUG FIX: Path(None) raises TypeError, which made the documented
        # default (takeout=None) unusable even though run() explicitly
        # supports it by falling back to the deprecated login flow.
        self.takeout = Path(takeout).expanduser() if takeout is not None else None
        self.path = Path(outpath)
        self.delay = delay
        self.raw = os.path.join(self.path, 'raw')  # TODO use Path
        self.ran = os.path.join(self.path, 'ran')  # TODO use Path
        # Results below are filled in lazily by check_df()/compute()/graph().
        self.df = None
        self.tags = None
        self.grapher = None
        self.seconds = None
        self.formatted_time = None
        self.most_viewed = None
        self.least_viewed = None
        self.best_per_decile = None
        self.worst_per_decile = None
        self.emojis = None
        self.oldest_videos = None
        self.oldest_upload = None
        self.HD = None
        self.UHD = None
        self.top_uploaders = None
        self.funny = None
        self.funny_counts = None

    def download_data(self):
        """Uses Takeout to download individual json files for each video."""
        watch_history = self.takeout / 'YouTube and YouTube Music/history/watch-history.html'
        if not watch_history.is_file():
            raise ValueError(f'"{watch_history}" is not a file. Did you download your YouTube data? ')
        print('Extracting video urls from Takeout.'); sys.stdout.flush()
        try:
            text = watch_history.read_text()
        except UnicodeDecodeError:
            # Takeout encoding varies by locale; retry explicitly as UTF-8.
            text = watch_history.read_text(encoding='utf-8')
        soup = BeautifulSoup(text, 'html.parser')
        urls = [u.get('href') for u in soup.find_all('a')]
        # BUG FIX: anchors without an href yield None; guard before the
        # substring test to avoid "argument of type 'NoneType' is not iterable".
        videos = [u for u in urls if u and 'www.youtube.com/watch' in u]
        self.path.mkdir(parents=True, exist_ok=True)
        url_path = self.path / 'urls.txt'
        url_path.write_text('\n'.join(videos))
        print(f'Urls extracted. Downloading data for {len(videos)} videos now.')
        os.makedirs(self.raw, exist_ok=True)
        output = os.path.join(self.raw, '%(autonumber)s')
        cmd = f'youtube-dl -o "{output}" --skip-download --write-info-json -i -a {url_path}'
        p = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.STDOUT, shell=True)
        # Stream youtube-dl's output until it closes its stdout.
        line = True
        while line:
            line = p.stdout.readline().decode("utf-8").strip()
            print(line)

    def deprecated_download_data_via_youtube_dl_login(self):
        """Uses youtube_dl to download individual json files for each video.

        Deprecated: logging in via youtube-dl frequently trips Google's bot
        detection. Kept only as a fallback when no Takeout path is given.
        """
        result = input(DEPRECATION_NOTE)
        if result.lower() != 'y':
            sys.exit()
        print('Okay, Let\'s login and download some data.')
        successful_login = False
        while not successful_login:
            successful_login = True
            user = input('Google username: ')
            pw = getpass.getpass('Google password: ')
            files = os.path.join(self.raw, '%(autonumber)s')
            if not os.path.exists(self.raw):
                os.makedirs(self.raw)
            # NOTE(review): the password is interpolated into a shell string;
            # risky if it contains quotes, but preserved for compatibility.
            template = ('youtube-dl -u "{}" -p "{}" '
                        '-o "{}" --sleep-interval {} '
                        '--skip-download --write-info-json -i '
                        'https://www.youtube.com/feed/history ')
            # Echo the command with the password masked.
            fake = template.format(user, '[$PASSWORD]', files, self.delay)
            print(f'Executing youtube-dl command:\n\n{fake}\n')
            cmd = template.format(user, pw, files, self.delay)
            p = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.STDOUT, shell=True)
            while True:
                line = p.stdout.readline().decode("utf-8").strip()
                print(line)
                if line == 'WARNING: unable to log in: bad username or password':
                    successful_login = False
                if not line:
                    break

    def df_from_files(self):
        """Constructs a Dataframe from the downloaded json files.

        All json keys whose values are not lists are compiled into the dataframe.
        The dataframe is then saved as a csv file in the self.ran directory.
        The tags of each video are pickled and saved as tags.txt
        """
        print('Creating dataframe...')
        num = len([name for name in os.listdir(self.raw) if not name[0] == '.'])
        files = os.path.join(self.raw, '~.info.json')  # This is a weird hack
        files = files.replace('~', '{:05d}')  # It allows path joining to work on Windows
        data = []
        for i in range(1, num + 1):
            # Context manager so each handle is closed promptly (the original
            # json.load(open(...)) leaked file descriptors).
            with open(files.format(i)) as f:
                data.append(json.load(f))
        # Pull the list-valued keys out of each record before building the
        # dataframe; they would not survive a flat csv round-trip.
        columns = ['formats', 'tags', 'categories', 'thumbnails']
        deletes = {col: [] for col in columns}
        for dt in data:
            for col, ls in deletes.items():
                ls.append(dt[col])
                del dt[col]
        self.df = pd.DataFrame(data)
        self.df['upload_date'] = pd.to_datetime(self.df['upload_date'], format='%Y%m%d')
        self.df.to_csv(os.path.join(self.ran, 'df.csv'))
        self.tags = deletes['tags']
        with open(os.path.join(self.ran, 'tags.txt'), 'wb') as f:
            pickle.dump(self.tags, f)

    def make_wordcloud(self):
        """Generate the wordcloud file and save it to static/images/."""
        print('Creating wordcloud')
        wordcloud = WordCloud(width=1920,
                              height=1080,
                              relative_scaling=.5)
        flat_tags = flatten_without_nones(self.tags)
        wordcloud.generate(' '.join(flat_tags))
        wordcloud.to_file(os.path.join('static', 'images', 'wordcloud.png'))

    def check_df(self):
        """Create the dataframe and tags from files if file doesn't exist."""
        if not os.path.exists(self.ran):
            os.makedirs(self.ran)
        df_file = os.path.join(self.ran, 'df.csv')
        if os.path.isfile(df_file):
            # parse_dates=[-11] targets the upload_date column by position;
            # the explicit to_datetime below covers it regardless.
            self.df = pd.read_csv(df_file, index_col=0, parse_dates=[-11])
            with open(os.path.join(self.ran, 'tags.txt'), 'rb') as f:
                self.tags = pickle.load(f)
            self.df['upload_date'] = pd.to_datetime(self.df['upload_date'])
        else:
            self.df_from_files()

    def total_time(self):
        """The amount of time spent watching videos.

        Sets self.seconds (total duration) and self.formatted_time
        (human-readable, e.g. '1 hour, 1 minute, 1 second').
        """
        self.seconds = self.df.duration.sum()
        seconds = self.seconds
        intervals = (
            ('years', 31449600),  # 60 * 60 * 24 * 7 * 52
            ('weeks', 604800),    # 60 * 60 * 24 * 7
            ('days', 86400),      # 60 * 60 * 24
            ('hours', 3600),      # 60 * 60
            ('minutes', 60),
            ('seconds', 1)
        )
        result = []
        for name, count in intervals:
            value = seconds // count
            if value:
                seconds -= value * count
                if value == 1:
                    name = name.rstrip('s')  # singularize, e.g. '1 hour'
                result.append("{} {}".format(int(value), name))
        self.formatted_time = ', '.join(result)

    def best_and_worst_videos(self):
        """Finds well liked and highly viewed videos."""
        self.most_viewed = self.df.loc[self.df['view_count'].idxmax()]
        low_views = self.df[self.df['view_count'] < 10]
        # Fixed seed keeps the sample stable across runs.
        self.least_viewed = low_views.sample(min(len(low_views), 10), random_state=0)
        self.df['deciles'] = pd.qcut(self.df['view_count'], 10, labels=False)
        grouped = self.df.groupby(by='deciles')
        self.best_per_decile = self.df.iloc[grouped['average_rating'].idxmax()]
        self.worst_per_decile = self.df.iloc[grouped['average_rating'].idxmin()]

    def most_emojis_description(self):
        """Saves the video whose description has the most unique emojis."""
        def _emoji_variety(desc):
            # emoji_lis is from emoji<2.0; each entry is {'location', 'emoji'}.
            return len({x['emoji'] for x in emoji_lis(desc)})
        counts = self.df['description'].apply(_emoji_variety)
        self.emojis = self.df.iloc[counts.idxmax()]

    def funniest_description(self):
        """Counts number of times 'funny' is in each description. Saves top result."""
        funny_counts = []
        index = []
        for i, d in enumerate(self.df.description):
            try:
                funny_counts.append(d.lower().count('funny'))
                index.append(i)
            except AttributeError:
                # Missing descriptions are NaN (float); skip them.
                pass
        if funny_counts:
            counts_arr = np.array(funny_counts)
            best = counts_arr.argmax()
            self.funny_counts = int(counts_arr[best])
        else:
            # BUG FIX: argmax on an empty array raises; treat "no usable
            # descriptions" the same as "no description mentions funny".
            self.funny_counts = 0
        if self.funny_counts > 0:
            self.funny = self.df.iloc[index[best]]
        else:
            title = 'Wait, 0? You\'re too cool to watch funny videos on youtube?'
            self.funny = make_fake_series(title, average_rating='N/A')

    def three_randoms(self):
        """Finds results for video resolutions, most popular channels, and funniest video."""
        height = self.df['height'].astype(int)
        self.HD = self.df[(720 <= height) & (height <= 1080)].shape[0]
        self.UHD = self.df[height > 1080].shape[0]
        self.top_uploaders = self.df.uploader.value_counts().head(n=15)
        self.funniest_description()

    def compute(self):
        """Run every analysis step and populate the result attributes."""
        print('Computing...')
        self.total_time()
        self.best_and_worst_videos()
        self.most_emojis_description()
        # Rows are newest-first, so the oldest watches are at the tail.
        self.oldest_videos = self.df[['title', 'webpage_url']].tail(n=10)
        self.oldest_upload = self.df.loc[self.df['upload_date'].idxmin()]
        self.three_randoms()

    def graph(self):
        """Generate the interactive plots via Grapher."""
        self.grapher = Grapher(self.df, self.tags)
        self.grapher.average_rating()
        self.grapher.duration()
        self.grapher.views()
        self.grapher.gen_tags_plot()

    def start_analysis(self):
        """Load (or build) the dataframe, then run all computations and plots."""
        self.check_df()
        # NOTE(review): WordCloud is imported unconditionally above, so this
        # guard is always true; it looks like a leftover from an optional
        # import pattern. Preserved as-is.
        if WordCloud is not None:
            self.make_wordcloud()
        self.compute()
        self.graph()

    def run(self):
        """Main function for downloading and analyzing data."""
        file1 = os.path.join(self.raw, '00001.info.json')
        some_data = os.path.isfile(file1)
        if not some_data:
            if self.takeout is not None:
                self.download_data()
            else:
                self.deprecated_download_data_via_youtube_dl_login()
            some_data = os.path.isfile(file1)
        if some_data:
            self.start_analysis()
        else:
            print('No data was downloaded.')
if __name__ == '__main__':
    print('Welcome!'); sys.stdout.flush()
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", '--out', default='data',
                        help="Path to empty directory for data storage.")
    parser.add_argument('-d', '--delay', default=0,
                        help='Time to wait between requests. May help avoid 2FA.')
    parser.add_argument('-t', '--takeout',
                        help='Path to an unzipped Takeout folder downloaded from https://takeout.google.com/')
    args = parser.parse_args()
    # `analysis` is deliberately module-level: the Flask view index() reads it.
    analysis = Analysis(args.takeout, args.out, float(args.delay))
    analysis.run()
    # Blocks serving the report at http://127.0.0.1:5000 until interrupted.
    launch_web()