forked from KA-Advocates/KATranslationCheck
/
YTSubtitles.py
executable file
·112 lines (99 loc) · 4.27 KB
/
YTSubtitles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python3
"""
Simple requests-based (non-cached) interface
"""
import requests
from ansicolor import black
import json
from collections import defaultdict
from multiprocessing import Pool
def getExercises():
return requests.get("https://www.khanacademy.org/api/v1/exercises").json()
def getExerciseVideos(name):
return requests.get("https://www.khanacademy.org/api/v1/exercises/{0}/videos".format(name)).json()
def hasYTTimedText(v, lang="de"):
url = "https://www.youtube.com/api/timedtext?caps=asr&key=yttt1&v=%s&lang=%s&fmt=srv2" % (v, lang)
response = requests.get(url)
return bool(response.text) # Empty if not available
def fetchVideos(lang):
"Fetch a list of videos for a given language"
res = requests.get("https://www.khanacademy.org/api/internal/translate_now?lang={0}".format(lang))
json = res.json()
return json["nodes"]["videos"]
def findDubbedVideos(lang):
"Find only videos which are dubbed"
videos = fetchVideos(lang)
return [k for k, v in videos.items() if v["dubbed"]]
def fetchOriginalVideoURL(vid):
"Depreated: Fix original video ID for a slug"
response = requests.get("https://www.khanacademy.org/api/v1/videos/{0}".format(vid)).json()
# Strip off extra YT attributes
return response["url"].partition("&")[0].replace("http:", "https:")
timedtextRE = re.compile(r"^https?://www\.youtube\.com/timedtext_video\?v=(.*)$")
def getTranslatedVideoId(vid, lang):
"Deprecated: Find redirect for a specific video ID"
url = "https://www.khanacademy.org/translate/videos/{0}/subtitle?lang={1}&dub=1".format(vid, lang)
response = requests.get(url, allow_redirects=False)
location = response.headers["location"]
match = timedtextRE.match(location)
if match is not None:
return "https://www.youtube.com/watch?v={0}".format(match.group(1))
else: return None
def fetchVideoMap(pool, lang):
"Deprecated: Fetch individual video dubs"
print(black("Fetching dubbed video list for lang {0}".format(lang), bold=True))
dubbedVideoIDs = findDubbedVideos(lang)
print(black("Fetching {0} dubbed video URLs for lang {1}"
.format(len(dubbedVideoIDs), lang), bold=True))
fn = functools.partial(getTranslatedVideoId, lang=lang)
videoURLs = pool.map(fn, dubbedVideoIDs)
# Remap
return {videoId: url
for videoId, url in zip(dubbedVideoIDs, videoURLs)
if url is not None}
if __name__ == "__main__":
#Download exercise list
print(black("Downloading master exercise list...", bold=True))
exercises = getExercises()
#Download videos
print(black("Downloading exercise video list for {0} exercises..."
.format(len(exercises)), bold=True))
pool = Pool(32)
exVideos = pool.map(getExerciseVideos, [e["name"] for e in exercises])
with open("fo.json","w") as outf:json.dump(exVideos, outf)
# Perform mapping
print(black("Mapping videos...", bold=True))
result = []
allVideoIDs = set() # Need that for subtitle mapping
for exercise, videos in zip(exercises, exVideos):
current_videos = []
for video in videos:
current_videos.append({
"title": video["title"],
"youtube_id": video["youtube_id"],
"duration": video["duration"],
})
allVideoIDs.add(video["youtube_id"])
exercise = {
"videos": current_videos,
"id": exercise["name"],
"title": exercise["title"],
"url": exercise["ka_url"],
}
result.append(exercise)
allVideoIDs = list(allVideoIDs) # Need guaranteed order
# Fetch subtitles for videos (reuse pool)
print(black("Checking subtitles for {0} videos..."
.format(len(allVideoIDs)), bold=True))
hasSubtitles = pool.map(hasYTTimedText, allVideoIDs)
hasSubtitleMap = {
v: hasSubs for v, hasSubs in zip(allVideoIDs, hasSubtitles)}
# Injects
print(black("Remapping subtitle information to video...", bold=True))
for exercise in result:
for video in exercise["videos"]:
video["has_subtitles"] = hasSubtitleMap[video["youtube_id"]]
# Write to output file
print(black("Writing to videos.json...", bold=True))
with open("videos.json", "w") as outfile:
json.dump(result, outfile)