-
Notifications
You must be signed in to change notification settings - Fork 0
/
TorecSubtitlesDownloader.py
237 lines (198 loc) · 9.23 KB
/
TorecSubtitlesDownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import cookielib
import datetime
import zipfile
import urllib2
import urllib
import codecs
import shutil
import time
import os
import sys
import re
import zlib
import os.path
from BeautifulSoup import BeautifulSoup
from SubtitleHelper import log
def convert_file(inFile,outFile):
    '''Re-encode a cp1255 text file as utf-8.

    The input is decoded line by line and every decoded line is written,
    unchanged, to the output file.

    :param inFile: the path to the input file (read as cp1255)
    :param outFile: the path to the output file (written as utf-8)
    '''
    with codecs.open(inFile, "r", "cp1255") as source:
        with codecs.open(outFile, 'w', 'utf-8') as target:
            for decodedLine in source:
                target.write(decodedLine)
class SubtitleOption(object):
    """A single selectable subtitle: a display name and its identifier."""

    def __init__(self, name, id):
        # Plain value holder; both values are stored exactly as given.
        self.id = id
        self.name = name

    def __repr__(self):
        # An option is represented by its name only.
        return "%s" % self.name
class SubtitlePage(object):
    """A subtitle page: its id, name, URL and the options parsed from its HTML."""

    def __init__(self, id, name, url, data):
        self.id = id
        self.name = name
        self.url = url
        # `data` is the page markup; the options are extracted once, here.
        self.options = self._parseOptions(data)

    def _parseOptions(self, data):
        """Build a SubtitleOption for every <option> that has a "value".

        Only the first <div class="download_box"> of the page is inspected.
        The option text (stripped) becomes the name and the "value"
        attribute becomes the id.
        """
        soup = BeautifulSoup(data)
        downloadBox = soup.findAll("div", {'class' : 'download_box' })[0]
        return [SubtitleOption(tag.string.strip(), tag["value"])
                for tag in downloadBox.findAll("option")
                if tag.has_key("value")]
class Response(object):
    """Body and headers of an HTTP response, with the body decompressed.

    :param response: an object offering ``read()`` and a ``headers``
                     attribute (the opener's response object)
    """

    def __init__(self, response):
        self.data = self._handleData(response)
        self.headers = response.headers

    def _handleData(self, resp):
        """Read the whole body and decompress it when it is compressed.

        :param resp: the response object to read from
        :return: the decompressed body, or the body exactly as read when
                 it is empty or is not a gzip/zlib stream
        """
        data = resp.read()
        if len(data) != 0:
            try:
                # 32 + MAX_WBITS lets zlib auto-detect a gzip *or* a zlib
                # header. The previous 16 + MAX_WBITS accepted gzip only, so
                # a zlib-wrapped "deflate" body came back still compressed.
                data = zlib.decompress(data, 32 + zlib.MAX_WBITS)
            except zlib.error:
                # Deliberate best effort: the body was not compressed (or
                # not in a format zlib recognises), so hand it back as is.
                pass
        return data
class FirefoxURLHandler():
    """HTTP helper that sends requests with a fixed set of browser headers.

    A single cookie-aware urllib2 opener is created per instance and reused
    for every request made through it.
    """

    def __init__(self):
        cj = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        self.opener.addheaders = [('Accept-Encoding','gzip, deflate'),
                                  ('Accept-Language', 'en-us,en;q=0.5'),
                                  ('Pragma', 'no-cache'),
                                  ('Cache-Control', 'no-cache'),
                                  ('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:16.0) Gecko/20100101 Firefox/16.0')]

    def _setHeader(self, name, value):
        """Set a default header on the opener, replacing any earlier value.

        Appending on every call (as the code used to do) made the header
        list grow without bound and left stale duplicates in it, so a
        reused handler could keep sending the first Referer/Cookie it was
        ever given.
        """
        lowered = name.lower()
        headers = [(key, val) for (key, val) in self.opener.addheaders
                   if key.lower() != lowered]
        headers.append((name, value))
        self.opener.addheaders = headers

    def request(self, url, data=None, ajax=False, referer=None, cookie=None):
        """Open `url` and return the result wrapped in a Response.

        :param url: the address to open
        :param data: optional mapping; when given it is url-encoded and
                     passed to the opener as the request data
        :param ajax: when true, send ``X-Requested-With: XMLHttpRequest``
        :param referer: optional value for the ``Referer`` header
        :param cookie: optional value for the ``Cookie`` header
        :return: a Response built from the opener's result

        The optional headers are stored on the opener, so (as before) they
        are also sent with later requests made through this instance.
        NOTE(review): other code in this file appears to rely on that
        stickiness (the download polling request passes no cookie of its
        own) - confirm before making these headers per-request.
        """
        if data is not None:
            data = urllib.urlencode(data)

        if ajax:
            self._setHeader('X-Requested-With', 'XMLHttpRequest')
        if referer is not None:
            self._setHeader('Referer', referer)
        if cookie is not None:
            self._setHeader('Cookie', cookie)

        resp = self.opener.open(url, data)
        return Response(resp)
class TorecSubtitlesDownloader:
    """Finds a subtitle page on torec.net and downloads one of its subtitles.

    All HTTP traffic goes through a single FirefoxURLHandler instance kept
    in ``self.urlHandler``.
    """

    DEFAULT_SEPERATOR = " "
    BASE_URL = "http://www.torec.net"
    SUBTITLE_PATH = "sub.asp?sub_id="
    DEFAULT_COOKIE = "Torec_NC_s=%(screen_width)d; Torec_NC_sub_%(subId)s=sub=%(current_datetime)s"

    def __init__(self):
        self.urlHandler = FirefoxURLHandler()

    def _buildDefaultCookie(self, subID):
        """Fill DEFAULT_COOKIE with `subID` and the current local time."""
        currentTime = datetime.datetime.now().strftime("%m/%d/%Y+%I:%M:%S+%p")
        return self.DEFAULT_COOKIE % {"screen_width" : 1760,
                                      "subId" : subID,
                                      "current_datetime" : currentTime}

    def searchMovieName(self, movieName):
        """Search the site for `movieName` and load the first subtitle page.

        :param movieName: the text posted as the "search" field
        :return: a SubtitlePage for the first ``sub.asp?sub_id=`` link found
                 in the search result, or None when there is no such link
        """
        response = self.urlHandler.request("%s/ssearch.asp" % self.BASE_URL, {"search" : movieName})
        match = re.search(r'sub\.asp\?sub_id=(\w+)', response.data)
        if match is None:
            return None

        subID = match.group(1)
        subURL = "%s/%s%s" % (self.BASE_URL, self.SUBTITLE_PATH, subID)
        subtitleData = self.urlHandler.request(subURL).data
        return SubtitlePage(subID, movieName, subURL, subtitleData)

    def findChosenOption(self, name, subtitlePage):
        """Pick the option whose name shares the most tokens with `name`.

        Both names are normalised with sanitize() before they are split into
        tokens. Previously only the option names were upper-cased, so a
        lower-case `name` could never match anything.

        :param name: the release/file name to match against
        :param subtitlePage: object whose ``options`` have a ``name``
        :return: the best matching option (the first one on a tie), or None
                 when no option shares a single token with `name`
        """
        wantedTokens = set(self.sanitize(name).split())

        maxLikelihood = 0
        chosenOption = None
        for option in subtitlePage.options:
            optionTokens = self.sanitize(option.name).split()
            # Every token of the option that also appears in the wanted
            # name counts once (repeated tokens count repeatedly).
            likelihood = sum(1 for token in optionTokens if token in wantedTokens)
            if likelihood > maxLikelihood:
                maxLikelihood = likelihood
                chosenOption = option
        return chosenOption

    def _requestSubtitle(self, subID, subURL):
        """Post the guest request for `subID` and return the response body.

        The request is sent as an AJAX call with `subURL` as referer and the
        default cookie for `subID`.
        """
        params = {"sub_id" : subID,
                  "s" : 1760}
        return self.urlHandler.request("%s/ajax/sub/guest_time.asp" % self.BASE_URL, params,
                                       ajax=True, referer=subURL,
                                       cookie=self._buildDefaultCookie(subID)).data

    def getDownloadLink(self, subID, optionID, subURL, persist=True):
        """Ask the site for the download link of one subtitle option.

        :param subID: id of the subtitle page
        :param optionID: id of the chosen option (sent as "code")
        :param subURL: URL of the subtitle page, used as referer
        :param persist: when true, poll up to 16 times, one second apart,
                        until a non-empty answer arrives; when false, ask
                        only once
        :return: the body of the last answer (empty when no link was given)
        """
        requestID = self._requestSubtitle(subID, subURL)
        params = {"sub_id" : subID, "code": optionID, "sh" : "yes", "guest" : requestID, "timewaited" : "16"}

        for _ in xrange(16):
            response = self.urlHandler.request("%s/ajax/sub/downloadun.asp" % self.BASE_URL, params, ajax=True)
            if (len(response.data) != 0 or not persist):
                break
            time.sleep(1)
        return response.data

    def download(self, downloadLink):
        """Fetch `downloadLink` (a path relative to BASE_URL).

        :return: a ``(data, fileName)`` tuple; the file name is whatever
                 follows ``filename=`` in the content-disposition header
        """
        # NOTE(review): a response without a content-disposition header, or
        # without "filename=" in it, raises here (KeyError/AttributeError).
        response = self.urlHandler.request("%s%s" % (self.BASE_URL, downloadLink))
        fileName = re.search("filename=(.*)", response.headers["content-disposition"]).groups()[0]
        return (response.data, fileName)

    def saveData(self, fileName, data, shouldUnzip=True):
        """Write `data` to `fileName` and optionally unpack it as a zip.

        When unzipping, the archive is extracted into the directory of
        `fileName` and then deleted; that directory must afterwards hold
        exactly one entry. If that entry ends with ".srt" it is converted
        from cp1255 to utf-8 in place.

        :param fileName: path the downloaded data is written to
        :param data: the raw downloaded bytes
        :param shouldUnzip: extract the saved file as a zip archive
        :return: base name of the extracted file, or of `fileName` itself
                 when `shouldUnzip` is false (that path used to have no
                 name to return)
        :raises RuntimeError: when the directory holds more than one entry,
                              or none at all, after extraction
        """
        log(__name__, "Saving to %s (size %d)" % (fileName, len(data)))
        # Save the downloaded zip file
        with open(fileName, "wb") as f:
            f.write(data)

        if not shouldUnzip:
            return os.path.basename(fileName)

        # Unzip the zip file
        log(__name__, "Unzip the zip file")
        zipDirPath = os.path.dirname(fileName)
        archive = zipfile.ZipFile(fileName, "r")
        try:
            archive.extractall(zipDirPath)
        finally:
            # Close the handle even when extraction fails.
            archive.close()
        # Remove the unneeded zip file
        os.remove(fileName)

        entries = os.listdir(zipDirPath)
        if len(entries) > 1:
            raise RuntimeError("subtitle directory should be temporary, sound two files in it")
        if not entries:
            # An empty archive used to surface as a bare IndexError.
            raise RuntimeError("subtitle archive did not contain any file")

        srtFile = entries[0]
        if srtFile.endswith(".srt"):
            srtFile = os.path.join(zipDirPath, srtFile)
            # convert file from cp1255 to utf-8
            log(__name__, "Convering to utf-8 %s" % srtFile)
            tempFileName = srtFile + ".tmp"
            convert_file(srtFile, tempFileName)
            shutil.copy(tempFileName, srtFile)
            os.remove(tempFileName)
        return os.path.basename(srtFile)

    def sanitize(self, name):
        """Upper-case `name` and replace ``.``, ``[``, ``]`` and ``-`` with the separator."""
        return re.sub(r'[\.\[\]\-]', self.DEFAULT_SEPERATOR, name.upper())

    def getSubtitleMetaData(self, movieName):
        """Search for the sanitized `movieName`.

        :return: the SubtitlePage that was found, or None
        """
        sanitizedName = self.sanitize(movieName)
        log(__name__, "Searching for %s" % sanitizedName)
        subtitlePage = self.searchMovieName(sanitizedName)
        if subtitlePage is None:
            log(__name__, "Couldn't find relevant subtitle page")
            return None

        log(__name__, "Found relevant meta data")
        return subtitlePage

    def getBestMatchID(self, name, subtitlePage):
        """Return the id of the option best matching `name`, or None."""
        chosen_option = self.findChosenOption(name, subtitlePage)
        if chosen_option is None:
            return None
        return chosen_option.id

    def _promptForOption(self, subtitlePage):
        """List the page's options and ask the user to pick one by number.

        :return: the chosen option, or None when the page has no options
        """
        log(__name__, "No suitable subtitle found!")
        if not subtitlePage.options:
            # Nothing to choose from; asking would never terminate.
            return None

        log(__name__, "Available options are:")
        for num, option in enumerate(subtitlePage.options, start=1):
            log(__name__, "\t(%d) %s" % (num, option))

        while True:
            try:
                choice = int(raw_input("What subtitle do you want to download? "))
            except ValueError:
                # Non-numeric input is just another bad choice.
                choice = 0
            # Options are numbered from 1; 0 used to slip through and
            # silently select the last option.
            if 1 <= choice <= len(subtitlePage.options):
                return subtitlePage.options[choice - 1]
            log(__name__, "bad choice")

    def getSubtitleData(self, movieName, resultSubtitleDirectory):
        """Find, download and save the subtitle for `movieName`.

        The option is chosen automatically when one matches the page name;
        otherwise the user is asked to pick one. Nothing is saved when no
        page, no option or no download link is available.

        :param movieName: the name to search for
        :param resultSubtitleDirectory: directory the download is saved in
        """
        subtitlePage = self.getSubtitleMetaData(movieName)
        if subtitlePage is None:
            # The search found nothing (already logged); nothing to download.
            return

        # Try to choose the most relevant option according to the file name
        chosenOption = self.findChosenOption(subtitlePage.name, subtitlePage)
        if chosenOption is not None:
            log(__name__, "Found the subtitle type - %s" % chosenOption)
        else:
            chosenOption = self._promptForOption(subtitlePage)
            if chosenOption is None:
                return

        # Retrieve the download link and download the subtitle
        downloadLink = self.getDownloadLink(subtitlePage.id, chosenOption.id, subtitlePage.url)
        if not downloadLink:
            log(__name__, "Download Unsuccessful!")
            return

        (subtitleData, subtitleName) = self.download(downloadLink)
        # The name comes from a response header: keep only its last path
        # component so it cannot point outside the result directory.
        resultSubtitlePath = os.path.join(resultSubtitleDirectory, os.path.basename(subtitleName))
        self.saveData(resultSubtitlePath, subtitleData)