def get_data(self):
    """Fetch and parse movie metadata from the iTunes "viewMovie" page.

    Reads ``self.id``; returns a dict of scraped movie fields (cast,
    genre, plot summary, poster/trailer URLs, ...).
    """
    data = {"id": self.id}
    url = compose_url("viewMovie", {"id": self.id})
    xml = read_url(url, None, ITUNES_HEADERS)
    # removed: leftover debug dump of the raw XML to a developer's
    # hard-coded desktop path, written via an unclosed file handle
    data["actors"] = parse_cast(xml, "actors")
    string = find_re(xml, "Average Rating:(.*?)</HBoxView>")
    # full stars render as rating_star_000033.png images, half stars as "½"
    data["averageRating"] = string.count("rating_star_000033.png") + string.count("½") * 0.5
    data["directors"] = parse_cast(xml, "directors")
    data["format"] = find_re(xml, "Format:(.*?)<")
    data["genre"] = decode_html(find_re(xml, "Genre:(.*?)<"))
    data["plotSummary"] = decode_html(
        find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
    )
    data["posterUrl"] = find_re(xml, 'reflection="." url="(.*?)"')
    data["producers"] = parse_cast(xml, "producers")
    data["rated"] = find_re(xml, "Rated(.*?)<")
    data["relatedMovies"] = parse_movies(xml, "related movies")
    data["releaseDate"] = find_re(xml, "Released(.*?)<")
    data["runTime"] = find_re(xml, "Run Time:(.*?)<")
    data["screenwriters"] = parse_cast(xml, "screenwriters")
    data["soundtrackId"] = find_re(xml, r"viewAlbum\?id=(.*?)&")
    data["trailerUrl"] = find_re(xml, 'autoplay="." url="(.*?)"')
    return data
def get_lyrics(title, artist):
    """Look up song lyrics on lyricsfly.com.

    Fetches a fresh API key from the API landing page, queries the API
    for `artist` / `title`, and returns the cleaned-up lyrics text.
    """
    html = read_url('http://lyricsfly.com/api/')
    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
    url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = read_url(url)
    lyrics = find_re(xml, r'<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
    lyrics = lyrics.replace('\n', '').replace('\r', '')
    lyrics = lyrics.replace('[br]', '\n').strip()
    # bug fix: the result of this replace() was previously discarded
    # (strings are immutable — the call had no effect without reassignment)
    lyrics = lyrics.replace('\n\n\n', '\n\n')
    # NOTE(review): replace('&', '&') is a no-op — possibly mangled from an
    # HTML-entity escape ('&amp;'); verify against the upstream source
    lyrics = decode_html(lyrics.replace('&', '&'))
    return lyrics
def parse_cast(xml, title):
    """Extract a list of names (actors/directors/...) from an iTunes XML page.

    `title` is the plural section name, e.g. "actors"; the section header
    in the XML is the singular, upper-cased form. Returns whatever names
    were collected before any parse failure (possibly []).
    """
    cast = []  # renamed from `list` to avoid shadowing the builtin
    try:
        strings = find_re(
            xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()
        ).split("</GotoURL>")
        strings.pop()  # drop the trailing fragment after the last entry
        for string in strings:
            cast.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        return cast
    except Exception:  # narrowed from bare except; layout changes yield partial/empty list
        return cast
def parse_xml_dict(xml):
    """Parse an iTunes <key>/<value> plist-like XML fragment into a dict.

    Supports <true/>, <integer> and <string> values; other value types
    are kept as their raw string content.
    """
    values = {}
    strings = xml.split("<key>")
    for string in strings:
        if string.find("</key>") != -1:
            key = find_re(string, "(.*?)</key>")
            # renamed from `type` to avoid shadowing the builtin
            value_type = find_re(string, "</key><(.*?)>")
            if value_type == "true/":
                value = True
            else:
                value = find_re(string, "<%s>(.*?)</%s>" % (value_type, value_type))
                if value_type == "integer":
                    value = int(value)
                elif value_type == "string":
                    value = decode_html(value)
            values[key] = value
    return values
def parse_movies(xml, title):
    """Extract related-movie {id, title} dicts from an iTunes XML page.

    `title` is the plural section name; the header in the XML is the
    singular, upper-cased form. Returns whatever entries were collected
    before any parse failure (possibly []).
    """
    movies = []  # renamed from `list` to avoid shadowing the builtin
    try:
        strings = find_re(
            xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()
        ).split("</GotoURL>")
        strings.pop()  # drop the trailing fragment after the last entry
        for string in strings:
            movies.append(
                {
                    "id": find_re(string, r"viewMovie\?id=(.*?)&"),
                    "title": find_re(
                        string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>'
                    ),
                }
            )
        return movies
    except Exception:  # narrowed from bare except; layout changes yield partial/empty list
        return movies
def get_id(url):
    """Derive an impawards id ("year/title") from a poster page URL.

    Strips the ".html" suffix and any "_xlg" / "_verN" poster-variant
    suffixes from the page name.
    """
    parts = url.split('/')
    year = parts[3]
    name_parts = parts[4][:-5].split('_')  # strip ".html", split title words
    if name_parts[-1] == 'xlg':
        name_parts.pop()  # drop extra-large poster suffix
    if find_re(name_parts[-1], r'ver\d+$'):
        name_parts.pop()  # drop version suffix like "ver2"
    # renamed local from `id` to avoid shadowing the builtin
    return '%s/%s' % (year, '_'.join(name_parts))
def get_data(self):
    """Scrape album metadata from the iTunes "viewAlbum" page for self.id."""
    xml = read_url(compose_url("viewAlbum", {"id": self.id}), None, ITUNES_HEADERS)
    track_fragments = find_re(xml, "<key>items</key>.*?<dict>(.*?)$").split("<dict>")
    data = {
        "id": self.id,
        "albumName": find_re(xml, "<B>(.*?)</B>"),
        "artistName": find_re(xml, "<b>(.*?)</b>"),
        "coverUrl": find_re(xml, 'reflection="." url="(.*?)"'),
        "genre": find_re(xml, "Genre:(.*?)<"),
        "releaseDate": find_re(xml, "Released(.*?)<"),
        "review": strip_tags(
            find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
        ),
        "tracks": [parse_xml_dict(fragment) for fragment in track_fragments],
        "type": find_re(xml, "<key>listType</key><string>(.*?)<"),
    }
    return data
def get_ids(page=None):
    """Collect impawards movie ids.

    With `page`, returns the set of ids found on that archive page.
    Without it, discovers the page count from the "latest" archive page,
    walks every archive page (oldest first) and returns an ordered,
    deduplicated list of ids.
    """
    if page:
        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1, unicode=True)
        results = re.compile(r'<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        return set(get_id('http://impawards.com/%s' % result) for result in results)
    # get all pages
    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    ids = []
    seen = set()  # O(1) membership test instead of scanning the list each time
    for archive_page in range(pages, 0, -1):
        # renamed loop variable from `id` to avoid shadowing the builtin
        for movie_id in get_ids(archive_page):
            if movie_id not in seen:
                seen.add(movie_id)
                ids.append(movie_id)
    return ids
def get_data(id):
    '''Scrape title, year, imdb id and poster URLs for an impawards id.

    >>> get_data('1991/silence_of_the_lambs')['imdbId']
    u'0102926'
    >>> get_data('1991/silence_of_the_lambs')['posters'][0]
    u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
    >>> get_data('1991/silence_of_the_lambs')['url']
    u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
    '''
    data = {
        'url': get_url(id)
    }
    html = read_url(data['url'], unicode=True)
    data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
    if not data['imdbId']:
        # fall back to a hand-maintained mapping for pages without an imdb link
        data['imdbId'] = _id_map.get(id, '')
    data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
    data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
    data['posters'] = []
    poster = find_re(html, '<img src="(posters.*?)"')
    if poster:
        poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
        data['posters'].append(poster)
    # links to alternate poster pages for the same movie (id[5:] strips "year/")
    results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
    for result in results:
        result = result.replace('_xlg.html', '.html')
        url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
        # NOTE: `html` is deliberately rebound to the alternate poster page here
        html = read_url(url, unicode=True)
        result = find_re(html, '<a href = (\w*?_xlg.html)')
        if result:
            # an extra-large version exists; fetch that page and take its image
            url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
            html = read_url(url, unicode=True)
            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
        else:
            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
        data['posters'].append(poster)
    return data
def get_id(self):
    """Search the iTunes store for self.title / self.director; return the movie id."""
    url = compose_url("advancedSearch", {"media": "movie", "title": self.title, "director": self.director})
    xml = read_url(url, headers=ITUNES_HEADERS)
    # renamed local from `id` to avoid shadowing the builtin
    movie_id = find_re(xml, r"viewMovie\?id=(.*?)&")
    return movie_id
def get_id(self):
    """Search the iTunes store for self.title / self.artist; return the album id."""
    url = compose_url("advancedSearch", {"media": "music", "title": self.title, "artist": self.artist})
    xml = read_url(url, headers=ITUNES_HEADERS)
    # renamed local from `id` to avoid shadowing the builtin
    album_id = find_re(xml, r"viewAlbum\?id=(.*?)&")
    return album_id
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
    '''
    Scrape metadata (title, director, year, posters, stills, trailers,
    optionally the imdb id) for a criterion.com film id.

    >>> get_data('1333').get('imdbId')
    u'0060304'
    >>> get_data('236')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
    >>> get_data('786')['posters'][0]
    u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
    '''
    data = {
        "url": get_url(id)
    }
    try:
        html = read_url(data["url"], timeout=timeout, unicode=True)
    except:
        # fall back to the raw (non-unicode) cached fetch if decoding fails
        html = ox.cache.read_url(data["url"], timeout=timeout)
    data["number"] = find_re(html, "<li>Spine #(\d+)")
    data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
    # strip the em-dash "— The Television Version" suffix some titles carry
    data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
    data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
    results = find_re(html, '<div class="left_column">(.*?)</div>')
    results = re.compile("<li>(.*?)</li>").findall(results)
    data["country"] = results[0]
    data["year"] = results[1]
    data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))
    result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
    if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
        # prefer the "Other Editions" section when this edition is present
        r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
        if r:
            result = r[0]
    result = find_re(result, "<a href=\"(.*?)\"")
    if not "/boxsets/" in result:
        data["posters"] = [result]
    else:
        # box set: follow the set page and pull this film's cover from it
        html_ = read_url(result, unicode=True)
        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
        result = find_re(result, "src=\"(.*?)\"")
        if result:
            # "_w100" marks the 100px-wide thumbnail; drop it for the full-size image
            data["posters"] = [result.replace("_w100", "")]
        else:
            data["posters"] = []
    # drop cache-busting "?123" suffixes from poster URLs
    data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
    result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
    if result:
        data["stills"] = [result]
        data["trailers"] = []
    else:
        # NOTE(review): filter() returns a list on Python 2 but a lazy iterator
        # on Python 3 — this looks like Python 2-era code; confirm callers
        # expect a list before porting
        data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
        data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
    if timeout == ox.cache.cache_timeout:
        timeout = -1
    if get_imdb:
        # removed year, as "title (year)" may fail to match
        data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
    return data
def get_url(id):
    """Return the impawards poster page URL for an id.

    Tries the plain page first; if it reports no posters, falls back to
    the "_ver1" variant of the page.
    """
    candidate = u"http://www.impawards.com/%s.html" % id
    page = read_url(candidate, unicode=True)
    if find_re(page, "No Movie Posters on This Page"):
        candidate = u"http://www.impawards.com/%s_ver1.html" % id
    return candidate