def parse_episodes(self):
    """Scrape the series' /episodes subpage into a list of episode dicts.

    Returns a list of dicts with keys:
        'season' (int), 'number' (int), 'date' (datetime.datetime),
        'name' (episode title text).
    """
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/episodes")
    soup = BeautifulSoup(content)
    seasons = soup.find('dl', {'class': 'episodesTable'})
    episodes_list = []
    for element in seasons.children:
        if element.name == 'dt':
            # <dt> carries the season header, e.g. "sezon 2".
            h3 = element.next_element
            try:
                season_number = int(h3.text.split(" ")[1])
            except (AttributeError, IndexError, ValueError):
                # Header does not match the expected "<word> <number>"
                # layout; stop instead of mislabelling later seasons.
                # (Was a bare "except:", which also hid real errors.)
                break
        if element.name == 'dd':
            # <dd> holds the <li> entries for the season parsed above.
            li_episodes = element.find_all("li")
            for episode in li_episodes:
                # Entry text starts with the episode number.
                episode_number = int(
                    re.match(r'\d+', episode.contents[0].text).group())
                date_str = episode.find(
                    'div', {'class': 'countryPremiereDate'}).text
                episode_name = episode.find('div', {'class': 'title'}).text
                episode_date = datetime.datetime.strptime(
                    date_str, '%d.%m.%Y')
                episodes_list.append({
                    'season': season_number,
                    'number': episode_number,
                    'date': episode_date,
                    'name': episode_name
                })
    return episodes_list
def parse_episodes(self):
    """Scrape the series' /episodes subpage into a list of episode dicts.

    Returns a list of dicts with keys:
        'season' (int), 'number' (int), 'date' (datetime or None),
        'name' (episode title text).
    Season/episode numbers default to 0 when the header text does not
    match the expected Polish "sezon N" / "odcinek N" layout.
    """
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/episodes")
    soup = BeautifulSoup(content)
    seasons = soup.findAll("table")
    episodeList = []
    for season in seasons:
        try:
            seasonNumber = season.find("h3").text.split()
            if seasonNumber[0] == 'sezon':
                seasonNumber = int(seasonNumber[1])
            else:
                seasonNumber = 0
        except (AttributeError, IndexError, ValueError):
            # Missing/odd header (was a bare "except:", which also hid
            # real errors such as typos in this block).
            seasonNumber = 0
        episodes = season.findAll("td")
        # Cells come in triples: number / premiere date / title.
        for i in range(0, len(episodes), 3):
            number = episodes[i].text.split()
            if number[0] == 'odcinek':
                number = int(number[1])
            else:
                number = 0
            episodeDate = get_datetime_or_none(episodes[i + 1].find('div'))
            episodeName = episodes[i + 2].text
            episodeList.append({'season': seasonNumber,
                                'number': number,
                                'date': episodeDate,
                                'name': episodeName})
    return episodeList
def _search_movie(self, title, results, genre_id, search_type, start_year, end_year):
    """Yield (movieID, title, url) tuples for movies matching the query."""
    grabber = HTMLGrabber()
    params = {"q": title.encode("utf-8"), "page": 1}
    if genre_id:
        params['genreIds'] = genre_id
    if start_year:
        params['startYear'] = start_year
    if end_year:
        params['endYear'] = end_year
    search_url = "/" + search_type if search_type else ""
    url = filmweb_search_blank + search_url + "?" + urlencode(params)
    content = grabber.retrieve(url)
    # @Make search more pages not only 1
    soup = BeautifulSoup(content)
    li_list = list(soup.findAll('div', {'class': 'hitDescWrapper'}))
    img_list = list(soup.findAll('div', {'class': 'hitImage'}))
    for i, hit in enumerate(li_list):
        anchor = hit.find('a', {'class': 'hdr hdr-medium hitTitle'})
        title = anchor.text
        url = anchor['href']
        # have to do another check because sometimes url doesnt provide movieID
        aimg = img_list[i].find('a')
        if aimg is None:
            continue
        img = aimg.find("img")
        movieID = get_real_id(url, img['src'])
        yield movieID, title, url
def _search_filtered_movie(self, title, results, genre_id, search_type):
    """Yield (movieID, title, url) tuples for hits on a filtered search page."""
    grabber = HTMLGrabber()
    params = {'page': 1}
    if title:
        params['q'] = title.encode("utf-8")
    if genre_id:
        params['genreIds'] = genre_id
    suffix = "/" + search_type if search_type else ""
    url = filmweb_search_blank + suffix + "?" + urllib.urlencode(params)
    content = grabber.retrieve(url)
    soup = BeautifulSoup(content)
    # Result entries are <li id="hit_NNN"> elements.
    for hit in soup.findAll('li', {'id': re.compile('hit_([0-9]*)')}):
        link = hit.find("h3").find("a")
        url = link['href']
        poster = hit.find("div", {'class': 'filmPoster-1'}).find("img")
        movieID = get_real_id(url, poster['src'])
        yield movieID, title, url
def parse_posters(self):
    """Return the movie's posters as dicts with 'href' (large) and 'thumb' keys."""
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/posters")
    soup = BeautifulSoup(content)
    poster_list = soup.find("ul", 'block-list postersList')
    # Swapping the ".2.jpg" suffix for ".3.jpg" selects the larger rendition.
    return [
        {'href': img['src'].replace(".2.jpg", '.3.jpg'), 'thumb': img['src']}
        for img in poster_list("img", {'class': "lbProxy"})
    ]
def parse_photos(self):
    """Return the movie's photos as dicts with 'href', 'thumb' and 'image' keys."""
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/photos")
    soup = BeautifulSoup(content)
    # BUG FIX: the attrs argument was the SET {'class', 'photosList'},
    # which BeautifulSoup interprets as "class is any of these values"
    # and so also matched class="class". A dict matches only
    # class="photosList", which is clearly what was intended.
    photos_list = soup.find("ul", {'class': 'photosList'})
    images = []
    for photo in photos_list.findAll("img"):
        images.append({'href': photo.parent['href'],
                       'thumb': photo['src'],
                       'image': photo.parent['data-photo']})
    return images
def parse_cast(self):
    """Scrape /cast/actors and /cast/crew and return a list of Person objects.

    For each cast row the person's filmweb id is taken from the profile
    link, falling back to the id embedded in the avatar image URL.
    """
    personList = []
    for url in ["/cast/actors", "/cast/crew"]:
        grabber = HTMLGrabber()
        content = grabber.retrieve(self.obj.url + url)
        soup = BeautifulSoup(content)
        for filmCastBox in soup.findAll("div", {'class': 'filmCastBox'}):
            # The node immediately preceding the box is the Polish section
            # header, e.g. "obsada" / "scenariusz" / "produkcja".
            personType = filmCastBox.previous
            personTypesChange = {
                'obsada': 'aktor',
                'scenariusz': 'scenarzysta',
                'produkcja': 'producent'
            }
            # backward compatibility
            # NOTE(review): only sections whose header is a key of the map
            # above are parsed, and the remapping itself is commented out,
            # so personType keeps the raw header text — confirm intended.
            if personType in personTypesChange:
                # personType = personTypesChange[personType]
                for cast in filmCastBox.findAll('tr', id=re.compile("role_")):
                    url_html = cast.find("a", {'class': 'pImg49'})
                    url = url_html['href']
                    img_html = url_html.find("img")
                    # Fallback patterns: the person id also appears in the
                    # avatar image URL (with or without the "_1" variant).
                    pattern_images = [
                        "http://1.fwcdn.pl/p/([0-9]{2})/([0-9]{2})/(?P<id>[0-9]*)/([0-9]*).([0-3]*).jpg",
                        "http://1.fwcdn.pl/p/([0-9]{2})/([0-9]{2})/(?P<id>[0-9]*)/([0-9]*)_1.([0-3]*).jpg"
                    ]
                    pattern_link = "/person/(.+)-(?P<id>[0-9]*)"
                    id = 0
                    results = re.search(pattern_link, url_html['href'])
                    if results:
                        id = results.group("id")
                    else:
                        for pattern in pattern_images:
                            results = re.search(
                                pattern, repr(img_html.extract()))
                            if results:
                                id = results.group("id")
                    role_html = cast.find('a', {'rel': 'v:starring'})
                    # Name and role live in consecutive siblings of the
                    # starring link's parent node.
                    role = role_html.parent.nextSibling.nextSibling.text
                    name = role_html.parent.nextSibling.text
                    personList.append(
                        Person(id, title=name, roleType=personType,
                               roleName=role, url=url))
    return personList
def get_list_genres():
    """Scrape the film search page for its genre filter checkboxes.

    Returns a list of {'genre_id': ..., 'genre_name': ...} dicts.
    """
    grabber = HTMLGrabber()
    content = grabber.retrieve(filmweb_search_blank + "/film")
    soup = BeautifulSoup(content)
    # The checkbox value is the id; the label text follows right after it.
    return [
        {'genre_id': checkbox.attrs['value'],
         'genre_name': checkbox.next_element.next_element.text}
        for checkbox in soup.findAll('input', {'name': 'genreIds'})
    ]
def parse_posters(self):
    """Return the movie's posters as dicts with 'href' (large) and 'thumb' keys."""
    grabber = HTMLGrabber()
    soup = BeautifulSoup(grabber.retrieve(self.obj.url + "/posters"))
    photoList = soup.find("ul", 'block-list postersList')
    images = []
    for photo in photoList("img", {'class': "lbProxy"}):
        thumb_src = photo['src']
        # ".2.jpg" -> ".3.jpg" selects the larger rendition of the poster.
        images.append({'href': thumb_src.replace(".2.jpg", '.3.jpg'),
                       'thumb': thumb_src})
    return images
def parse_photos(self):
    """Return the movie's photos as dicts with 'href', 'thumb' and 'image' keys."""
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/photos")
    soup = BeautifulSoup(content)
    # BUG FIX: the attrs argument was the SET {'class', 'photosList'},
    # which BeautifulSoup interprets as "class is any of these values"
    # and so also matched class="class". A dict matches only
    # class="photosList", which is clearly what was intended.
    photos_list = soup.find("ul", {'class': 'photosList'})
    images = []
    for photo in photos_list.findAll("img"):
        images.append({
            'href': photo.parent['href'],
            'thumb': photo['src'],
            'image': photo.parent['data-photo']
        })
    return images
def parse_cast(self):
    """Scrape /cast/actors and /cast/crew and return a list of Person objects.

    For each cast row the person's filmweb id is taken from the profile
    link, falling back to the id embedded in the avatar image URL.
    """
    personList = []
    for url in ["/cast/actors", "/cast/crew"]:
        grabber = HTMLGrabber()
        content = grabber.retrieve(self.obj.url + url)
        soup = BeautifulSoup(content)
        for filmCastBox in soup.findAll("div", {'class': 'filmCastBox'}):
            # The node immediately preceding the box is the Polish section
            # header, e.g. "obsada" / "scenariusz" / "produkcja".
            personType = filmCastBox.previous
            personTypesChange = {'obsada': 'aktor',
                                 'scenariusz': 'scenarzysta',
                                 'produkcja': 'producent'}
            # backward compatibility
            # NOTE(review): only sections whose header is a key of the map
            # above are parsed, and the remapping itself is commented out,
            # so personType keeps the raw header text — confirm intended.
            if personType in personTypesChange:
                # personType = personTypesChange[personType]
                for cast in filmCastBox.findAll('tr', id=re.compile("role_")):
                    url_html = cast.find("a", {'class': 'pImg49'})
                    url = url_html['href']
                    img_html = url_html.find("img")
                    # Fallback patterns: the person id also appears in the
                    # avatar image URL (with or without the "_1" variant).
                    pattern_images = [
                        "http://1.fwcdn.pl/p/([0-9]{2})/([0-9]{2})/(?P<id>[0-9]*)/([0-9]*).([0-3]*).jpg",
                        "http://1.fwcdn.pl/p/([0-9]{2})/([0-9]{2})/(?P<id>[0-9]*)/([0-9]*)_1.([0-3]*).jpg"
                    ]
                    pattern_link = "/person/(.+)-(?P<id>[0-9]*)"
                    id = 0
                    results = re.search(pattern_link, url_html['href'])
                    if results:
                        id = results.group("id")
                    else:
                        for pattern in pattern_images:
                            results = re.search(
                                pattern, repr(img_html.extract()))
                            if results:
                                id = results.group("id")
                    role_html = cast.find('a', {'rel': 'v:starring'})
                    # Name and role live in consecutive siblings of the
                    # starring link's parent node.
                    role = role_html.parent.nextSibling.nextSibling.text
                    name = role_html.parent.nextSibling.text
                    personList.append(Person(id, title=name,
                                             roleType=personType,
                                             roleName=role, url=url))
    return personList
def _search_movie(self, title, results):
    """Yield (movieID, title, url) tuples for movies matching *title*."""
    grabber = HTMLGrabber()
    p_title = grabber.encode_string(title)
    # for type in ['film','serial']:
    content = grabber.retrieve(filmweb_search % (p_title, 1))
    # @Make search more pages not only 1
    soup = BeautifulSoup(content)
    li_list = list(soup.findAll('div', {'class': 'hitDescWrapper'}))
    img_list = list(soup.findAll('div', {'class': 'hitImage'}))
    for i, hit in enumerate(li_list):
        # class="hdr hdr-medium hitTitle" for now
        anchor = hit.find('a', {'class': re.compile('hdr.*')})
        title = anchor.text
        url = anchor['href']
        # have to do another check because sometimes url doesnt provide movieID
        aimg = img_list[i].find('a')
        if aimg is None:
            continue
        img = aimg.find("img")
        movieID = get_real_id(url, img['src'])
        yield movieID, title, url
def _search_person(self, title, results=20):
    """Yield (personID, title, url) tuples for persons matching *title*.

    Example query: http://www.filmweb.pl/search/person?q=Tom+Cruise
    """
    grabber = HTMLGrabber()
    p_title = grabber.encode_string(title)
    # @Make search more pages not only 1
    content = grabber.retrieve(filmweb_person_search % (p_title, 1))
    soup = BeautifulSoup(content)
    li_list = list(soup.findAll('div', {'class': 'hitDescWrapper'}))
    img_list = list(soup.findAll('div', {'class': 'hitImage'}))
    for i, hit in enumerate(li_list):
        anchor = hit.find('a', {'class': 'hdr hdr-medium hitTitle'})
        title = anchor.text
        url = anchor['href']
        # have to do another check because sometimes url doesnt provide movieID
        aimg = img_list[i].find('a')
        if aimg is None:
            continue
        img = aimg.find('img')
        personID = get_real_id(url, img['src'])
        yield personID, title, url
def parse_episodes(self):
    """Scrape the series' /episodes subpage into a list of episode dicts.

    Returns a list of dicts with keys:
        'season' (int), 'number' (int), 'date' (datetime.datetime),
        'name' (episode title text).
    """
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/episodes")
    soup = BeautifulSoup(content)
    seasons = soup.find('dl', {'class': 'episodesTable'})
    episodes_list = []
    for element in seasons.children:
        if element.name == 'dt':
            # <dt> carries the season header, e.g. "sezon 2".
            h3 = element.next_element
            try:
                season_number = int(h3.text.split(" ")[1])
            except (AttributeError, IndexError, ValueError):
                # Header does not match the expected "<word> <number>"
                # layout; stop instead of mislabelling later seasons.
                # (Was a bare "except:", which also hid real errors.)
                break
        if element.name == 'dd':
            # <dd> holds the <li> entries for the season parsed above.
            li_episodes = element.find_all("li")
            for episode in li_episodes:
                # Entry text starts with the episode number.
                episode_number = int(
                    re.match(r'\d+', episode.contents[0].text).group())
                date_str = episode.find(
                    'div', {'class': 'countryPremiereDate'}).text
                episode_name = episode.find('div', {'class': 'title'}).text
                episode_date = datetime.datetime.strptime(
                    date_str, '%d.%m.%Y')
                episodes_list.append({'season': season_number,
                                      'number': episode_number,
                                      'date': episode_date,
                                      'name': episode_name})
    return episodes_list
def parse_cast(self):
    """Scrape the /cast subpage and return a list of Person objects.

    The page is a definition list: each <dt> is a role-type header
    (e.g. "rezyser / director") and the following <dd> lists the people
    holding that role.
    """
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/cast")
    soup = BeautifulSoup(content)
    castList = soup.find("div", {'class': 'filmSubpageContentWrapper'})
    castGroups = castList.findAll(["dd", "dt"])
    personList = []
    for cast in castGroups:
        if cast.name == 'dt':
            # Keep the Polish half of the "polish / english" header.
            # (Removed a stray no-op "roleType" expression statement that
            # followed this assignment.)
            roleType = cast.text.split("/")[0].strip()
        elif cast.name == 'dd':
            # Renamed from "castList", which shadowed the outer variable.
            members = cast.findAll("li")
            for person in members:
                try:
                    role = person.find("span", {'class': 'roleName'}).text
                except AttributeError:
                    role = None
                name = person.find("span", {'class': 'personName'}).text
                patternlink = "/person/(.+)-(?P<id>[0-9]*)"
                patternimg = "http://1.fwcdn.pl/p/([0-9]{2})/([0-9]{2})/(?P<id>[0-9]*)/([0-9]*).([0-3]*).jpg"
                href = person.find(
                    "span", {'class': 'personName'}).find("a")['href']
                results = re.search(patternlink, href)
                if results:
                    id = results.group("id")
                else:
                    # Fall back to the id embedded in the portrait URL.
                    results = re.search(patternimg, unicode(person.extract()))
                    id = results.group("id")
                personList.append(Person(id, title=name, roleType=roleType,
                                         roleName=role, url=href))
    return personList
import re
import urllib

from bs4 import BeautifulSoup

from filmweb.parser.HTMLGrabber import HTMLGrabber
from filmweb.vars import filmweb_search_blank

# Build a filtered search URL (hard-coded: genre id 2, type "film") and
# read the total result count out of the "resultsCount" element.
grabber = HTMLGrabber()
params = {}
params['page'] = 1
title = None
genre_id = 2
search_type = "film"
if title:
    params['q'] = title.encode("utf-8")
if genre_id:
    params['genreIds'] = genre_id
search_url = ""
if search_type:
    search_url = "/" + search_type
url = filmweb_search_blank + search_url + "?" + urllib.urlencode(params)
content = grabber.retrieve(url)
soup = BeautifulSoup(content)
hits = soup.find(id="resultsCount")
results_text = hits.get_text()
# The counter reads e.g. "... z 1234"; grab the "z <total>" fragment.
# Raw string for the regex, and renamed the result from "all", which
# shadowed the builtin of the same name.
count_pattern = re.compile(r'z \d+')
matches = count_pattern.findall(results_text)
total = matches[0]
def parse_cast(self):
    """Scrape /cast/actors and return the actors as Person objects.

    The person's filmweb id is taken from the profile link, falling back
    to the id embedded in the portrait image URL.
    """
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/cast/actors")
    soup = BeautifulSoup(content)
    cast_list_table = soup.find("table", {'class': 'filmCast'})
    personList = []
    for cast in cast_list_table.findAll('tr', id=re.compile("role_")):
        url_html = cast.find("a", {'class': 'pImg49'})
        url = url_html['href']
        img_html = url_html.find("img")
        pattern_img = "http://1.fwcdn.pl/p/([0-9]{2})/([0-9]{2})/(?P<id>[0-9]*)/([0-9]*).([0-3]*).jpg"
        pattern_link = "/person/(.+)-(?P<id>[0-9]*)"
        results = re.search(pattern_link, url_html['href'])
        if results:
            id = results.group("id")
        else:
            # Fall back to the id embedded in the portrait image URL.
            results = re.search(pattern_img, unicode(img_html.extract()))
            id = results.group("id")
        role_html = cast.find('a', {'rel': 'v:starring'})
        # Name and role live in consecutive siblings of the link's parent.
        role = role_html.parent.nextSibling.nextSibling.text
        name = role_html.parent.nextSibling.text
        personList.append(Person(id, title=name, roleType='aktor',
                                 roleName=role, url=url))
    # Removed an unreachable dt/dd branch left over from an older
    # <dl>-based parser: the loop variable is always a <tr> (never 'dt'
    # or 'dd'), and referencing it after the loop raised NameError on an
    # empty table. Also dropped the unused "tbody" lookup.
    return personList
def _download_content(self, url):
    """Fetch *url* and cache the raw HTML and its parsed soup on self.

    Side effects: sets self._content and self._soup.
    """
    # Local import — presumably to avoid a module-level import cycle
    # with the parser package; TODO confirm.
    from filmweb.parser.HTMLGrabber import HTMLGrabber
    grabber = HTMLGrabber()
    self._content = grabber.retrieve(url)
    # NOTE(review): reads self.content (no underscore) right after
    # setting self._content — presumably a property exposing _content
    # elsewhere in the class; verify it exists.
    self._soup = BeautifulSoup(self.content)