-
Notifications
You must be signed in to change notification settings - Fork 0
/
pytrailer.py
249 lines (206 loc) · 7.95 KB
/
pytrailer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import codecs
import json
import re
import locale
import logging
from time import mktime
try:
import urllib2 as urllib
urllib.request = urllib
from HTMLParser import HTMLParser
except ImportError:
# python3.x
import urllib.request, urllib.error, urllib.parse
from html.parser import HTMLParser
import dateutil.parser as dparser
def getMoviesFromJSON(jsonURL):
    """Main function for this library

    Returns list of Movie classes from apple.com/trailers json URL
    such as: http://trailers.apple.com/trailers/home/feeds/just_added.json

    The Movie classes use lazy loading mechanisms so that data not
    directly available from JSON are loaded on demand. Currently these
    lazy loaded parts are:
     * poster
     * trailerLinks
     * description

    Be warned that accessing these fields can take long time due to
    network access. Therefore do the loading in thread separate from
    UI thread or your users will notice.

    There are optional fields that may or may not be present in every
    Movie instance. These include:
     * actors (list)
     * directors (list)
     * rating (string)
     * genre (string)
     * studio (string)
     * releasedate (string)

    Please take care when trying to access these fields as they may
    not exist.

    jsonURL - URL of the JSON feed; URLs containing 'quickfind' are
              treated as search results, whose movie list sits under
              the 'results' key.

    Raises whatever urlopen/json.loads raise on network or parse
    failure, and KeyError when a movie entry lacks one of the mandatory
    keys ('title', 'location', 'poster', 'trailers').
    """
    response = urllib.request.urlopen(jsonURL)
    try:
        jsonData = response.read().decode('utf-8')
    finally:
        # release the connection even when reading/decoding fails
        response.close()
    objects = json.loads(jsonData)

    # make it work for search urls
    if 'quickfind' in jsonURL:
        objects = objects['results']

    optionalInfo = ('actors', 'directors', 'rating', 'genre', 'studio',
                    'releasedate')
    movies = []
    for obj in objects:
        movie = Movie()
        movie.title = obj['title']
        movie.baseURL = obj['location']
        movie.posterURL = obj['poster']
        # sometimes posters don't have the scheme/host part; https URLs
        # are already absolute and must not get the prefix either
        if not movie.posterURL.startswith(('http:', 'https:')):
            movie.posterURL = "http://apple.com%s" % movie.posterURL
        movie.trailers = obj['trailers']

        # copy over only those optional fields the feed actually provides
        for i in optionalInfo:
            if i in obj:
                setattr(movie, i, obj[i])

        movies.append(movie)

    return movies
class Movie(object):
    """Main class representing all trailers for single Movie

    Most fields should be self-descriptive. The ``poster``,
    ``trailerLinks`` and ``description`` properties are fetched over the
    network on first access and cached on the instance afterwards.
    """

    def __init__(self):
        self.title = None
        # URL of poster for the movie
        self.posterURL = None
        # base URL of movie such as "/trailers/magnolia/nightcatchesus/"
        self.baseURL = None
        # trailer entries as present in JSON URL; get_latest_trailer_date()
        # reads the 'postdate' key of each entry
        self.trailers = []
        # caches backing the lazy properties below
        self._posterData = None
        self._trailerLinks = None
        self._description = None

    def get_trailerLinks(self):
        """Returns the trailer links found by WebIncParser for this movie.

        Normally a dictionary with trailer names as keys and a set of
        trailer urls as values. Each trailer can have more links due
        to different qualities. When the parser finds links but no
        trailer names, the bare set of urls is returned instead.

        Example:
        {'Trailer': {'url1', 'url2'}, 'Featurette': {'url1', 'url2'}}
        """
        if self._trailerLinks:
            return self._trailerLinks

        wip = WebIncParser("http://trailers.apple.com" + self.baseURL,
                           "includes/playlists/web.inc")
        self._trailerLinks = wip.getTrailers()
        return self._trailerLinks

    def set_trailerLinks(self, val):
        self._trailerLinks = val

    trailerLinks = property(get_trailerLinks, set_trailerLinks)

    def get_poster(self):
        """Returns poster data itself (as in JPEG/GIF/PNG file)"""
        if self._posterData:
            return self._posterData

        response = urllib.request.urlopen(self.posterURL)
        try:
            self._posterData = response.read()
        finally:
            # do not leak the connection if the read fails
            response.close()
        return self._posterData

    def set_poster(self, val):
        self._posterData = val

    poster = property(get_poster, set_poster)

    def get_description(self):
        """Returns description text as provided by the studio

        The text is taken from the <meta name="Description"> tag of the
        movie page. The string "None" is returned when the tag is
        missing and "Error" when fetching or decoding the page fails.
        Either result is cached like a real description, so the page
        is not requested again.
        """
        if self._description:
            return self._description

        try:
            trailerURL = "http://trailers.apple.com%s" % self.baseURL
            response = urllib.request.urlopen(trailerURL)
            try:
                Reader = codecs.getreader("utf-8")
                trailerHTML = Reader(response).read()
            finally:
                response.close()
            description = re.search(
                '<meta *name="Description" *content="(.*?)" *[/]*>',
                trailerHTML)
            if description:
                self._description = description.group(1)
            else:
                self._description = "None"
        except Exception:
            # best effort: any failure is reported as "Error", but
            # KeyboardInterrupt/SystemExit are allowed to propagate
            self._description = "Error"
        return self._description

    def set_description(self, val):
        self._description = val

    description = property(get_description, set_description)

    def get_latest_trailer_date(self):
        """Returns date (unix timestamp) of latest trailer for this movie

        The timestamp is time.mktime() of the parsed 'postdate' of each
        entry in self.trailers; 0 is returned when there are no
        trailers.
        """
        if not self.trailers:
            return 0

        # Parse with the "C" locale, then put back exactly what was active
        # before. (locale.resetlocale() is gone in Python 3.13 and would not
        # restore a locale chosen by the application anyway.)
        # presumably "C" is needed so English month/day names in 'postdate'
        # parse regardless of the user's locale -- TODO confirm
        # NOTE: setlocale() changes process-wide state.
        savedLocale = locale.setlocale(locale.LC_ALL)
        locale.setlocale(locale.LC_ALL, "C")
        try:
            pdates = [dparser.parse(trailer['postdate'])
                      for trailer in self.trailers]
        finally:
            locale.setlocale(locale.LC_ALL, savedLocale)

        tsMax = 0
        for pdate in pdates:
            ts = mktime(pdate.timetuple())
            if ts > tsMax:
                tsMax = ts
        return tsMax
class WebIncParser(HTMLParser):
    """Class for parsing data from web.inc html files that exist for
    every Movie

    Each movie has associated web.inc file that contains pieces of
    html containing trailer names (as in "Trailer", "Featurette"
    etc) and links to trailers themselves.

    Links to further 'includes...' files are followed recursively with
    a fresh parser; the set of already visited relative URLs is shared
    between the parsers so that no include is fetched twice.
    """
    # values stored in self.pos (URLS is not used by this class itself)
    H3 = 1
    URLS = 2

    def __init__(self, baseURL, relativeURL, parsedURLS=None):
        """baseURL     - absolute URL of the movie directory
        relativeURL - path of the include file, appended to baseURL
        parsedURLS  - optional set of relative URLs already visited; it
                      is shared (not copied) and relativeURL is added
                      to it
        """
        HTMLParser.__init__(self)
        # trailer name -> set of urls
        self.trailers = {}
        # urls collected so far that have not been assigned a name yet
        self.dirtyURLS = set()
        self.pos = 0
        self.baseURL = baseURL
        self.URL = baseURL + relativeURL
        # compare against None: an empty set handed in by the caller must
        # still be shared rather than silently replaced
        if parsedURLS is None:
            parsedURLS = set()
        self.parsedURLS = parsedURLS
        self.parsedURLS.add(relativeURL)

    def getTrailers(self):
        """Downloads self.URL, parses it and returns the trailers found.

        Returns dictionary with trailer names as keys and set of
        trailer urls as values. Each trailer can have more links due
        to different qualities. When no trailer name was seen, the
        bare set of collected urls is returned instead.

        Example:
        {'Trailer': {'url1', 'url2'}, 'Featurette': {'url1', 'url2'}}
        """
        response = urllib.request.urlopen(self.URL)
        logging.info("Processing: " + self.URL)
        try:
            data = response.read().decode('utf-8')
        finally:
            # release the connection even when reading/decoding fails
            response.close()
        self.pos = 0
        self.feed(data)
        self.close()
        if not self.trailers:
            return self.dirtyURLS
        return self.trailers

    @staticmethod
    def _normalizeTrailerURL(url):
        """Rewrites a .mov link to its '_h<quality>' file name."""
        subPos = url.rfind('_')
        if subPos == url.rfind('_h.'):
            # 'name_h.480.mov' -> 'name_h480.mov'
            url = re.sub('(.*)/([^/]*)_h.([^/]*mov).*', r'\1/\2_h\3', url)
        else:
            # 'name_480p.mov' -> 'name_h480p.mov'
            url = re.sub('(.*)/([^/]*)_([^/]*mov).*', r'\1/\2_h\3', url)
        # links that already carried the 'h' got a second one above
        url = re.sub('_hh', '_h', url)
        # the lookahead keeps an existing 'h640w' from becoming 'h640ww'
        url = re.sub('h640(?!w)', 'h640w', url)
        return url

    def _parseInclude(self, relativeURL):
        """Parses a nested include file and merges its result."""
        wip = WebIncParser(self.baseURL, relativeURL, self.parsedURLS)
        ret = wip.getTrailers()
        if isinstance(ret, set):
            # unnamed urls: they wait for the next <h3 title=...>
            self.dirtyURLS.update(ret)
        else:
            # merge so that trailers collected earlier are not lost
            self.trailers.update(ret)

    def handle_starttag(self, tag, attrs):
        """HTMLParser callback collecting trailer names and links.

        <h3 title=X> assigns the urls collected so far to name X,
        <a href=...mov> adds a (normalized) trailer url and
        <a href=includes...> is followed recursively unless that
        include has been visited already.
        """
        tag = tag.lower()
        if tag == 'h3':
            for name, val in attrs:
                if name == 'title' and self.dirtyURLS:
                    self.trailers[val] = self.dirtyURLS
                    self.dirtyURLS = set()
            self.pos = self.H3
        elif tag == 'a':
            for name, val in attrs:
                # attributes without a value are reported as None
                if name != 'href' or not val:
                    continue
                if '.mov' in val:
                    url = self._normalizeTrailerURL(val)
                    logging.info("Found trailer url: " + url)
                    self.dirtyURLS.add(url)
                elif val.startswith('includes'):
                    if val in self.parsedURLS:
                        continue
                    self._parseInclude(val)