def parse_film(self, soup):
    """Extract film details from a parsed film detail page.

    Reads the localized title (``h2``) and the title (``h4``) from the
    ``ps_belka`` element, collects ``label: value`` pairs from every
    ``row`` element, gathers the screening entries listed under ``pokazy``
    and takes the ``href`` of the parent of the first image in ``ps_body``.

    Returns a dict with the keys ``title``, ``title_localized``,
    ``directors`` (list of non-empty, stripped names), ``times`` (one
    ``self.parse_t`` result per screening entry), ``year`` (raw text of
    the ``Rok produkcji`` row, or None) and ``img_url`` (None when the
    page body has no image).
    """
    header = soup.find(ct('ps_belka'))
    title_pl = tag_text(header.find('h2'))
    title = tag_text(header.find('h4'))

    # Rows read "label: value". A row without a colon carries no property,
    # so it is skipped instead of being stored under a None key.
    props = {}
    for row in soup.findAll(ct('row')):
        label, colon, value = tag_text(row).partition(':')
        if colon:
            props[label.strip()] = value.strip()

    # The first "li" is dropped (presumably a header entry - confirm
    # against the page markup).
    listing = soup.find(ct('pokazy'))
    entries = listing.findAll('li')[1:] if listing else []
    screenings = [tag_text(entry) for entry in entries]

    image = soup.find(ct('ps_body')).find('img')
    img_url = image.parent['href'] if image else None

    # A list rather than filter(): filter() is a one-shot iterator on
    # Python 3, which would be exhausted after the first traversal.
    names = props.get(u'Reżyser', '').split(',')
    directors = [name.strip() for name in names if name.strip()]

    # NOTE(review): parse_t is handed the entry *text* here, not the tag -
    # confirm that this class's parse_t expects text.
    return {
        'title': title,
        'title_localized': title_pl,
        'directors': directors,
        'times': [self.parse_t(entry) for entry in screenings],
        'year': props.get('Rok produkcji', None),
        'img_url': img_url,
    }
def parse_t(self, t):
    """Parse one screening row into ``(theater, start, venue)``.

    ``t`` is a parsed element containing ``perf_date``, ``perf_time`` and
    ``perf_venue`` children. The day number is the first run of digits
    following the leading word of the date text; the time text must start
    with ``HH:MM``.

    Returns a tuple of the theater object from ``self.get_theater``, the
    screening datetime localized to the theater's timezone, and the venue
    text. Returns None (after logging a warning) when the date or time
    text cannot be parsed or when no theater matches the venue.
    """
    date_text = tag_text(t.find(ct('perf_date')))
    time_text = tag_text(t.find(ct('perf_time')))

    # Raw strings keep the regex escapes away from Python's own
    # string-escape processing.
    date_match = re.match(r"\w+\s*(\d+)", date_text)
    time_match = re.match(r"(\d+):(\d+)", time_text)
    if not date_match or not time_match:
        # Skip an unparseable row the same way an unknown venue is
        # skipped, instead of failing on None.group().
        logger.warning('cannot parse screening date/time: %r / %r',
                       date_text, time_text)
        return None

    day = date_match.group(1)
    hour, minute = time_match.groups()

    venue = tag_text(t.find(ct('perf_venue'))).strip()
    theater = self.get_theater(venue)
    if not theater:
        logger.warning('no theater found for %r', venue)
        return None

    # hour, minute and day are passed on as the matched strings.
    start = self.get_screening_datetime(hour, minute, day)
    zone = pytz.timezone(theater.timezone_id)
    return theater, zone.localize(start), venue
def films(self):
    """Yield a parsed film dict for every time block in the calendar.

    Walks the calendar pages ``/lff/calendar/20111012`` through
    ``/lff/calendar/20111027``. For each ``calendar-teaser-timeblock`` the
    screening rows are parsed with ``self.parse_t`` (falsy results are
    dropped), the linked detail page is fetched and parsed with
    ``self.parse_film``, and the collected screenings are stored under
    ``'times'``. Films without a title are not yielded.
    """
    for day_code in range(20111012, 20111028):
        calendar = self.get_soup('/lff/calendar/%d' % day_code)
        for block in calendar.findAll(ct('calendar-teaser-timeblock')):
            times = []
            for row in block.findAll(ct('perf_row')):
                screening = self.parse_t(row)
                if screening:
                    times.append(screening)

            link = block.find(ct('show_title')).find('a')
            film = self.parse_film(self.get_soup(link['href']))
            film['times'] = times
            if film.get('title'):
                yield film
def films(self):
    """Iterate over the festival calendar and yield parsed films.

    One calendar page is requested per value in 20111012..20111027
    (``/lff/calendar/<value>``). Every ``calendar-teaser-timeblock`` on a
    page contributes one film: its ``perf_row`` elements go through
    ``self.parse_t`` (results that are falsy are discarded), its
    ``show_title`` link is followed to the detail page, and the result of
    ``self.parse_film`` gets the screenings assigned to ``'times'``.
    Only films with a truthy ``'title'`` are yielded.
    """
    first_page, end_page = 20111012, 20111028
    for page_id in range(first_page, end_page):
        page = self.get_soup('/lff/calendar/%d' % page_id)
        for teaser in page.findAll(ct('calendar-teaser-timeblock')):
            rows = teaser.findAll(ct('perf_row'))
            kept = [parsed_row
                    for parsed_row in map(self.parse_t, rows)
                    if parsed_row]

            detail_href = teaser.find(ct('show_title')).find('a')['href']
            detail_page = self.get_soup(detail_href)
            result = self.parse_film(detail_page)
            result['times'] = kept

            if not result.get('title'):
                continue
            yield result
def parse_film(self, soup):
    """Extract film details from a parsed film detail page.

    The title is the text of the ``h1`` inside the element with id
    ``header-one-films`` (None when that element is absent). Credits are
    read from every ``screening-with-credits-item`` element as a
    left-text / right-text pair.

    Returns a dict with ``title``, ``directors`` (list of non-empty names
    from the ``Director`` credit, split on commas), ``year`` (int, or None
    when the ``Year`` credit is missing, empty or not numeric) and
    ``synopsis`` (text of the ``program-item-alternatetitle`` element).
    """
    header = soup.find(id="header-one-films")
    title = tag_text(header.find('h1')) if header else None

    props = {}
    for item in soup.findAll(ct('screening-with-credits-item')):
        label = tag_text(item.find(ct('screening-with-credits-left')))
        value = tag_text(item.find(ct('screening-with-credits-right')))
        props[label] = value

    # A list rather than filter(): filter() is a one-shot iterator on
    # Python 3, which would be exhausted after the first traversal.
    names = props.get('Director', '').split(',')
    directors = [name.strip() for name in names if name.strip()]

    synopsis = tag_text(soup.find(ct('program-item-alternatetitle')))

    # Conditional expression instead of "x and int(x) or None", which
    # would turn a successfully parsed 0 into None.
    raw_year = props.get('Year')
    try:
        year = int(raw_year) if raw_year else None
    except ValueError:
        year = None

    return {
        'title': title,
        'directors': directors,
        'year': year,
        'synopsis': synopsis,
    }
def films(self):
    """Yield a parsed film for every entry of the alphabetical listing.

    The listing is split into the sections ``special`` and ``a``..``z``.
    For each section, subpages 0 through 9 are requested from
    ``/filmy/wszystkie/<section>/<subpage>/``; paging within a section
    stops at the first subpage that has no ``nowina`` elements. Each
    entry's first link is followed (prefixed with ``/``) and the fetched
    page is handed to ``self.parse_film``.
    """
    sections = ['special'] + list('abcdefghijklmnopqrstuvwxyz')
    for section in sections:
        for page_no in range(10):
            listing = self.get_soup(
                '/filmy/wszystkie/%s/%s/' % (section, page_no))
            entries = listing.findAll(ct('nowina'))
            if not entries:
                # An empty subpage marks the end of this section.
                break
            for entry in entries:
                detail_path = '/' + entry.find('a')['href']
                yield self.parse_film(self.get_soup(detail_path))
def parse_film(self, soup):
    """Build a film dict from a parsed film detail page.

    Looks up the ``h1`` under the element with id ``header-one-films``
    for the title, pairs the left and right texts of each
    ``screening-with-credits-item`` element into a credits mapping, and
    reads the synopsis from the ``program-item-alternatetitle`` element.

    Returns a dict with the keys:
      ``title``     - title text, or None when the header element is absent
      ``directors`` - list of non-empty names from the ``Director`` credit
      ``year``      - ``Year`` credit as int; None if missing or not numeric
      ``synopsis``  - synopsis text
    """
    heading = soup.find(id="header-one-films")
    title = tag_text(heading.find('h1')) if heading else None

    credits = dict(
        (tag_text(node.find(ct('screening-with-credits-left'))),
         tag_text(node.find(ct('screening-with-credits-right'))))
        for node in soup.findAll(ct('screening-with-credits-item'))
    )

    # Materialized as a list: on Python 3, filter() would hand callers a
    # single-use iterator.
    directors = []
    for part in credits.get('Director', '').split(','):
        name = part.strip()
        if name:
            directors.append(name)

    synopsis = tag_text(soup.find(ct('program-item-alternatetitle')))

    # The "and/or" chain is avoided because it maps a parsed 0 to None.
    year_text = credits.get('Year')
    year = None
    if year_text:
        try:
            year = int(year_text)
        except ValueError:
            year = None

    return {
        'title': title,
        'directors': directors,
        'year': year,
        'synopsis': synopsis,
    }
def parse_film(self, soup):
    """Build a film dict from a parsed film detail page.

    Sources within the page:
      * ``ps_belka`` element - ``h2`` holds the localized title, ``h4``
        the title;
      * ``row`` elements - ``label: value`` texts, split on the first
        colon;
      * ``pokazy`` element - screening entries, one per ``li`` after the
        first;
      * ``ps_body`` element - the ``href`` of the parent of its first
        image.

    Returns a dict with the keys ``title``, ``title_localized``,
    ``directors`` (list of non-empty, stripped names), ``times`` (one
    ``self.parse_t`` result per screening entry), ``year`` (text of the
    ``Rok produkcji`` row, or None) and ``img_url`` (None when no image
    is found).
    """
    belka = soup.find(ct('ps_belka'))
    title_pl = tag_text(belka.find('h2'))
    title = tag_text(belka.find('h4'))

    def _row(element):
        # Split "label: value"; returns None for rows without a colon.
        pieces = tag_text(element).split(':', 1)
        if len(pieces) != 2:
            return None
        return pieces[0].strip(), pieces[1].strip()

    # Rows without a colon are left out rather than stored under None.
    pairs = [_row(element) for element in soup.findAll(ct('row'))]
    props = dict(pair for pair in pairs if pair is not None)

    # The first "li" is skipped (presumably a header entry - confirm).
    pokazy = soup.find(ct('pokazy'))
    screenings = []
    if pokazy:
        screenings = [tag_text(li) for li in pokazy.findAll('li')[1:]]

    img = soup.find(ct('ps_body')).find('img')
    img_url = img.parent['href'] if img else None

    # Built as a list because filter() is a single-use iterator on
    # Python 3.
    directors = []
    for part in props.get(u'Reżyser', '').split(','):
        name = part.strip()
        if name:
            directors.append(name)

    # NOTE(review): parse_t receives the entry text, not the tag -
    # confirm that this class's parse_t expects text.
    return {
        'title': title,
        'title_localized': title_pl,
        'directors': directors,
        'times': [self.parse_t(text) for text in screenings],
        'year': props.get('Rok produkcji', None),
        'img_url': img_url,
    }