def scrapeShowInfo(url, owned=False):
    """Scrape a season page and return its show metadata and episode list.

    The page at ``url`` is fetched with ``common.getURL``, stripped of
    ``<script>`` and ``<style>`` blocks and parsed with BeautifulSoup.

    Args:
        url: Address of the season page to fetch.
        owned: When true, episode rows that contain a checkbox ``<input>``
            are skipped.

    Returns:
        A ``(showdata, episodes)`` tuple.

        ``showdata`` is the list ``[season, episodecount, plot, creator,
        runtime, year, network, actors, genres, stars, votes]``.  Any field
        that cannot be found or parsed is ``None``; ``runtime`` is the
        running time in minutes rendered as a string.

        ``episodes`` holds one list per episode row:
        ``[asin, season, episodeNum, title, url, plot, airDate, isHD]``.
        It is empty when the page has no episode list.
    """
    tags = re.compile(r'<.*?>')
    scripts = re.compile(r'<script.*?script>', re.DOTALL)
    style = re.compile(r'<style.*?style>', re.DOTALL)
    spaces = re.compile(r'\s+')

    # Remove script and style blocks before handing the markup to the parser.
    data = common.getURL(url)
    data = scripts.sub('', data)
    data = style.sub('', data)
    tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)

    # The season number lives either in a dedicated element or, failing
    # that, in a heading of the form "... Season <n>".
    try:
        season = int(
            tree.find('div', attrs={'class': 'unbox_season_selected'}).string)
    except Exception:
        try:
            heading = tree.find('div', attrs={'style': 'font-size: 120%;font-weight:bold; margin-top:15px;margin-bottom:10px;'})
            season = int(heading.contents[0].split('Season')[1].strip())
        except Exception:
            season = None

    episodes = []
    episodebox = tree.find('div', attrs={'id': 'avod-ep-list-rows'})
    if episodebox is None:
        # No episode list on this page: dump the markup to aid debugging.
        print(tree.prettify())
        episodecount = None
    else:
        rows = episodebox.findAll('tr', attrs={'asin': True})
        episodecount = len(rows)
        for episode in rows:
            # NOTE(review): presumably the checkbox is a purchase selector,
            # so a row that still has one is not owned yet -- confirm.
            if owned and episode.find('input', attrs={'type': 'checkbox'}):
                continue

            episodeASIN = episode['asin']
            episodetitle = episode.find(
                attrs={'title': True})['title'].encode('utf-8')
            isHD = '[HD]' in episodetitle
            if isHD:
                # str.replace returns a new string, so keep the result.
                episodetitle = episodetitle.replace('[HD]', '').strip()

            airDate = episode.find(
                attrs={'style': 'width: 150px; overflow: hidden'}
            ).string.strip()

            try:
                plot = episode.findAll('div')[1].string.strip()
            except Exception:
                plot = ''

            # The cell text starts with "<number>."; when the cell has
            # nested markup ``.string`` is None, so use its first child.
            numbercell = episode.find('div', attrs={'style': 'width: 185px;'})
            try:
                episodeNum = int(numbercell.string.split('.')[0].strip())
            except Exception:
                episodeNum = int(numbercell.contents[0].split('.')[0].strip())

            episodeurl = common.BASE_URL + '/gp/product/' + episodeASIN
            episodes.append([
                episodeASIN, season, episodeNum, episodetitle, episodeurl,
                plot, airDate, isHD
            ])

    # Rating text is split on "out of "; the leading number is doubled.
    # NOTE(review): presumably this maps a 5-star scale onto 10 -- confirm.
    try:
        stardata = tree.find(
            'span', attrs={'class': 'crAvgStars'}).renderContents()
        stardata = scripts.sub('', stardata)
        stardata = tags.sub('', stardata)
        stardata = spaces.sub(' ', stardata).strip().split('out of ')
        stars = float(stardata[0]) * 2
        votes = stardata[1].split('customer reviews')[0].split(
            'See all reviews')[1].replace('(', '').strip()
    except Exception:
        stars = None
        votes = None

    # Each metadata <div> renders as "Label: value"; collect them by label.
    metadict = {}
    for metadata in tree.findAll(
            'div', attrs={'style': 'margin-top:7px;margin-bottom:7px;'}):
        mdata = metadata.renderContents()
        mdata = scripts.sub('', mdata)
        mdata = tags.sub('', mdata)
        mdata = spaces.sub(' ', mdata).strip().split(': ')
        metadict[mdata[0].strip()] = ' '.join(mdata[1:]).strip()

    plot = metadict.get('Synopsis')
    creator = metadict.get('Creator')

    # Convert "<h> hour(s) <m> minute(s)" to total minutes, as a string.
    runtime = None
    rawruntime = metadict.get('Runtime')
    if rawruntime is not None:
        hourword = None
        if 'hours' in rawruntime:
            hourword = 'hours'
        elif 'hour' in rawruntime:
            hourword = 'hour'

        minuteword = None
        if 'minutes' in rawruntime:
            minuteword = 'minutes'
        elif 'minute' in rawruntime:
            minuteword = 'minute'

        try:
            if hourword and minuteword:
                parts = rawruntime.replace(minuteword, '').split(hourword)
                minutes = int(parts[0].strip()) * 60
                try:
                    minutes += int(parts[1].strip())
                except ValueError:
                    # Minutes part is unparseable: keep the hours only.
                    pass
            elif hourword:
                minutes = int(rawruntime.replace(hourword, '').strip()) * 60
            elif minuteword:
                minutes = int(rawruntime.replace(minuteword, '').strip())
            else:
                # No recognisable unit in the text.
                minutes = None
        except ValueError:
            minutes = None

        if minutes is not None:
            runtime = str(minutes)

    try:
        year = int(metadict['Season year'])
    except (KeyError, ValueError):
        year = None

    network = metadict.get('Network')

    actors = metadict.get('Starring')
    if actors is not None and 'Supporting actors' in metadict:
        actors = actors + ', ' + metadict['Supporting actors']

    genres = metadict.get('Genre')

    print(metadict)
    showdata = [
        season, episodecount, plot, creator, runtime, year, network, actors,
        genres, stars, votes
    ]
    return showdata, episodes
def scrapeShowInfo(url, owned=False):
    """Scrape a season page and return its show metadata and episode list.

    The page at ``url`` is fetched with ``common.getURL``, stripped of
    ``<script>`` and ``<style>`` blocks and parsed with BeautifulSoup.

    Args:
        url: Address of the season page to fetch.
        owned: When true, episode rows that contain a checkbox ``<input>``
            are skipped.

    Returns:
        A ``(showdata, episodes)`` tuple.

        ``showdata`` is the list ``[season, episodecount, plot, creator,
        runtime, year, network, actors, genres, stars, votes]``.  Any field
        that cannot be found or parsed is ``None``; ``runtime`` is the
        running time in minutes rendered as a string.

        ``episodes`` holds one list per episode row:
        ``[asin, season, episodeNum, title, url, plot, airDate, isHD]``.
        It is empty when the page has no episode list.
    """
    tags = re.compile(r'<.*?>')
    scripts = re.compile(r'<script.*?script>', re.DOTALL)
    style = re.compile(r'<style.*?style>', re.DOTALL)
    spaces = re.compile(r'\s+')

    # Remove script and style blocks before handing the markup to the parser.
    data = common.getURL(url)
    data = scripts.sub('', data)
    data = style.sub('', data)
    tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)

    # The season number lives either in a dedicated element or, failing
    # that, in a heading of the form "... Season <n>".
    try:
        season = int(
            tree.find('div', attrs={'class': 'unbox_season_selected'}).string)
    except Exception:
        try:
            heading = tree.find('div', attrs={'style': 'font-size: 120%;font-weight:bold; margin-top:15px;margin-bottom:10px;'})
            season = int(heading.contents[0].split('Season')[1].strip())
        except Exception:
            season = None

    episodes = []
    episodebox = tree.find('div', attrs={'id': 'avod-ep-list-rows'})
    if episodebox is None:
        # No episode list on this page: dump the markup to aid debugging.
        print(tree.prettify())
        episodecount = None
    else:
        rows = episodebox.findAll('tr', attrs={'asin': True})
        episodecount = len(rows)
        for episode in rows:
            # NOTE(review): presumably the checkbox is a purchase selector,
            # so a row that still has one is not owned yet -- confirm.
            if owned and episode.find('input', attrs={'type': 'checkbox'}):
                continue

            episodeASIN = episode['asin']
            episodetitle = episode.find(
                attrs={'title': True})['title'].encode('utf-8')
            isHD = '[HD]' in episodetitle
            if isHD:
                # str.replace returns a new string, so keep the result.
                episodetitle = episodetitle.replace('[HD]', '').strip()

            airDate = episode.find(
                attrs={'style': 'width: 150px; overflow: hidden'}
            ).string.strip()

            try:
                plot = episode.findAll('div')[1].string.strip()
            except Exception:
                plot = ''

            # The cell text starts with "<number>."; when the cell has
            # nested markup ``.string`` is None, so use its first child.
            numbercell = episode.find('div', attrs={'style': 'width: 185px;'})
            try:
                episodeNum = int(numbercell.string.split('.')[0].strip())
            except Exception:
                episodeNum = int(numbercell.contents[0].split('.')[0].strip())

            episodeurl = common.BASE_URL + '/gp/product/' + episodeASIN
            episodes.append([
                episodeASIN, season, episodeNum, episodetitle, episodeurl,
                plot, airDate, isHD
            ])

    # Rating text is split on "out of "; the leading number is doubled.
    # NOTE(review): presumably this maps a 5-star scale onto 10 -- confirm.
    try:
        stardata = tree.find(
            'span', attrs={'class': 'crAvgStars'}).renderContents()
        stardata = scripts.sub('', stardata)
        stardata = tags.sub('', stardata)
        stardata = spaces.sub(' ', stardata).strip().split('out of ')
        stars = float(stardata[0]) * 2
        votes = stardata[1].split('customer reviews')[0].split(
            'See all reviews')[1].replace('(', '').strip()
    except Exception:
        stars = None
        votes = None

    # Each metadata <div> renders as "Label: value"; collect them by label.
    metadict = {}
    for metadata in tree.findAll(
            'div', attrs={'style': 'margin-top:7px;margin-bottom:7px;'}):
        mdata = metadata.renderContents()
        mdata = scripts.sub('', mdata)
        mdata = tags.sub('', mdata)
        mdata = spaces.sub(' ', mdata).strip().split(': ')
        metadict[mdata[0].strip()] = ' '.join(mdata[1:]).strip()

    plot = metadict.get('Synopsis')
    creator = metadict.get('Creator')

    # Convert "<h> hour(s) <m> minute(s)" to total minutes, as a string.
    runtime = None
    rawruntime = metadict.get('Runtime')
    if rawruntime is not None:
        hourword = None
        if 'hours' in rawruntime:
            hourword = 'hours'
        elif 'hour' in rawruntime:
            hourword = 'hour'

        minuteword = None
        if 'minutes' in rawruntime:
            minuteword = 'minutes'
        elif 'minute' in rawruntime:
            minuteword = 'minute'

        try:
            if hourword and minuteword:
                parts = rawruntime.replace(minuteword, '').split(hourword)
                minutes = int(parts[0].strip()) * 60
                try:
                    minutes += int(parts[1].strip())
                except ValueError:
                    # Minutes part is unparseable: keep the hours only.
                    pass
            elif hourword:
                minutes = int(rawruntime.replace(hourword, '').strip()) * 60
            elif minuteword:
                minutes = int(rawruntime.replace(minuteword, '').strip())
            else:
                # No recognisable unit in the text.
                minutes = None
        except ValueError:
            minutes = None

        if minutes is not None:
            runtime = str(minutes)

    try:
        year = int(metadict['Season year'])
    except (KeyError, ValueError):
        year = None

    network = metadict.get('Network')

    actors = metadict.get('Starring')
    if actors is not None and 'Supporting actors' in metadict:
        actors = actors + ', ' + metadict['Supporting actors']

    genres = metadict.get('Genre')

    print(metadict)
    showdata = [
        season, episodecount, plot, creator, runtime, year, network, actors,
        genres, stars, votes
    ]
    return showdata, episodes