def _parse_event(self, event):
    """Build a ``Showtime`` from a single calendar event.

    The start is taken from the event's ``dtstart`` and the main title
    from its ``summary``. When the ``description`` matches
    ``self.desc_re``, the original title, year, length and tags are read
    from the match groups ``title``, ``year``, ``min`` and ``tags``.

    Tags that have no entry in ``self.tags_map`` are left out instead of
    being stored as ``None``. An event without a description is treated
    like one whose description does not match.
    """
    starts_at = times.to_universal(event.get('dtstart').dt)
    title_main = event.get('summary')
    titles = [title_main]

    title_orig = year = length = None
    tags = []

    # NOTE(review): presumably ``event`` is dict-like, so ``get()`` gives
    # None for a missing description; ``match(None)`` would raise
    # TypeError, hence the empty-string fallback.
    match = self.desc_re.match(event.get('description') or '')
    if match:
        if match.group('title'):
            title_orig = match.group('title').strip()
            titles.append(title_orig)

        year = int(match.group('year'))
        length = int(match.group('min'))

        # ``tags_map.get()`` returns None for labels it does not know;
        # keep only the tags that were actually mapped.
        mapped = (
            self.tags_map.get(label.strip())
            for label in match.group('tags').split(',')
        )
        tags = [tag for tag in mapped if tag]

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main=title_main,
            title_orig=title_orig,
            titles=titles,
            year=year,
            length=length,
        ),
        starts_at=starts_at,
        tags=tags,
    )
def _parse_row(self, row):
    """Turn one table row into a ``Showtime``.

    Cells are read by position: 1 and 2 are handed to the date/time
    parser, 3 and 4 hold the main and the original title, 5 and 6 are
    looked up in ``self.tags_map`` and 8 provides the booking link.
    """
    starts_at = parsers.date_time_year(
        row[1].text_content(),
        row[2].text_content(),
    )
    main_title = row[3].text_content()
    original_title = row[4].text_content()

    # TODO scrape tags according to new implementation of tags
    # presented in https://github.com/honzajavorek/zitkino.cz/issues/97
    labels = (row[5].text_content(), row[6].text_content())
    mapped = [self.tags_map.get(label) for label in labels]

    booking_link = row[8].link()

    # Labels without a mapping come back as None and are dropped here.
    tag_dict = {}
    for tag in mapped:
        if tag:
            tag_dict[tag] = None

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main_scraped=main_title,
            title_orig_scraped=original_title,
        ),
        starts_at=starts_at,
        tags=tag_dict,
        url=self.url,
        url_booking=booking_link,
    )
def _parse_row(self, row):
    """Turn one table row into a ``Showtime``.

    Cells are read by position: 1 and 2 are handed to the date/time
    parser, 3 and 4 hold the main and the original title, 5 and 6 are
    looked up in ``self.tags_map``, 7 is parsed as the price and 8
    provides the booking link.

    Tag cells that have no entry in ``self.tags_map`` are left out
    instead of being stored as ``None``.
    """
    starts_at = parsers.date_time_year(
        row[1].text_content(),
        row[2].text_content(),
    )
    title_main = row[3].text_content()
    title_orig = row[4].text_content()

    # ``tags_map.get()`` returns None for unknown labels; keep only the
    # tags that were actually mapped.
    labels = (row[5].text_content(), row[6].text_content())
    mapped = (self.tags_map.get(label) for label in labels)
    tags = [tag for tag in mapped if tag]

    url_booking = row[8].link()
    price = parsers.price(row[7].text_content())

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main=title_main,
            titles=[title_main, title_orig],
        ),
        starts_at=starts_at,
        tags=tags,
        url_booking=url_booking,
        price=price,
    )
def _parse_event(self, event):
    """Build a ``Showtime`` from a single calendar event.

    The start is taken from the event's ``dtstart`` and the main title
    from its ``summary``. When the ``description`` matches
    ``self.desc_re``, the original title, year, length and tags are read
    from the match groups ``title``, ``year``, ``min`` and ``tags``.
    Tags without an entry in ``self.tags_map`` are dropped.

    An event without a description is treated like one whose
    description does not match.
    """
    starts_at = times.to_universal(event.get('dtstart').dt)
    title_main = event.get('summary')

    title_orig = year = length = None
    tags = []

    # NOTE(review): presumably ``event`` is dict-like, so ``get()`` gives
    # None for a missing description; ``match(None)`` would raise
    # TypeError, hence the empty-string fallback.
    match = self.desc_re.match(event.get('description') or '')
    if match:
        if match.group('title'):
            title_orig = match.group('title').strip()

        year = int(match.group('year'))
        length = int(match.group('min'))

        # TODO scrape tags according to new implementation of tags
        # presented in https://github.com/honzajavorek/zitkino.cz/issues/97
        tags = [
            self.tags_map.get(label.strip())
            for label in match.group('tags').split(',')
        ]

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main_scraped=title_main,
            title_orig_scraped=title_orig,
            year=year,
            length=length,
        ),
        starts_at=starts_at,
        tags={tag: None for tag in tags if tag},
        url='http://kinonadobraku.cz',
    )
def _parse_row(self, row, tags=None):
    """Create a ``Showtime`` from one programme row.

    The film link (``.movie a:not(.tag)``) supplies the title and URL,
    the ``.date`` cell supplies the start, and ``self._parse_tags`` /
    ``self._parse_details`` supply tags and further film details.
    """
    film_link = row.cssselect_first('.movie a:not(.tag)')
    film_url = film_link.link()
    film_title = film_link.text_content()

    # The date cell is split on line breaks into exactly two parts:
    # the day and the time.
    date_text = row.cssselect_first('.date').text_content(whitespace=True)
    day_text, time_text = re.split(r'[\r\n]+', date_text)

    day = parsers.date_cs(day_text)
    clock = datetime.time(*(int(part) for part in time_text.split(':')))
    starts_at = times.to_universal(
        datetime.datetime.combine(day, clock),
        'Europe/Prague',
    )

    row_tags = self._parse_tags(row, tags)
    film_details = self._parse_details(film_url)

    film = ScrapedFilm(
        title_main_scraped=film_title,
        url=film_url,
        **film_details
    )
    return Showtime(
        cinema=cinema,
        film_scraped=film,
        starts_at=starts_at,
        tags=row_tags,
        url=self.url,
    )
def _parse_row(self, row, date, url):
    """Convert one row into a :class:`~zitkino.models.Showtime` object.

    The cells of ``row`` are inspected one by one and the showtime is
    filled according to the CSS classes each cell carries. ``date`` is
    passed on to the time parser and ``url`` becomes the showtime's URL.
    """
    showtime = Showtime(cinema=cinema, url=url)
    collected_tags = {}

    for cell in row:
        # The checks are independent: a cell is tested against every
        # class, not just the first one that applies.
        if cell.has_class('col_time_reservation'):
            showtime.starts_at = self._parse_time(cell, date)
            showtime.url_booking = cell.link()

        if cell.has_class('col_movie_name'):
            info = self._parse_info(cell)
            showtime.film_scraped = ScrapedFilm(
                title_main_scraped=info.title,
                url=info.url,
                **self._parse_details(info.url)
            )
            collected_tags.update(dict.fromkeys(info.tags))

        if cell.has_class('col_param_icons'):
            icon_tags = self._parse_tags_from_icons(cell)
            collected_tags.update(icon_tags)

        if cell.has_class('col_cycle'):
            cycle_tags = self._parse_tags_from_cycles(cell)
            collected_tags.update(cycle_tags)

    showtime.tags = collected_tags
    return showtime
def _parse_row(self, row, subrow, tags=None):
    """Create a ``Showtime`` from a programme row and one of its subrows.

    ``row`` provides the date (``.film_table_datum``), ``subrow`` the
    time (``.cas``) and, through ``self._parse_subrow``, the title,
    booking and tag elements. ``tags`` are tags the caller already knows
    for this row.

    Returns ``None`` when the subrow has no title element or when the
    title is listed in ``self.title_blacklist``.

    The caller's ``tags`` list is never modified, and a tag element
    whose text has no entry in ``self.tags_map`` is ignored instead of
    adding ``None``.
    """
    elements = self._parse_subrow(subrow)

    title_el = elements.get('title')
    if title_el is None:
        return None
    title_main = title_el.text_content()
    if title_main in self.title_blacklist:
        return None

    starts_at = parsers.date_time_year(
        row.cssselect('.film_table_datum')[0].text_content(),
        subrow.cssselect('.cas')[0].text_content(),
    )

    booking_el = elements.get('booking')
    url_booking = booking_el.link() if booking_el is not None else None

    # Work on a copy: appending to the list passed in would leak this
    # subrow's tag into every other showtime sharing that list.
    tags = list(tags or [])
    tag_el = elements.get('tag')
    if tag_el is not None:
        tag = self.tags_map.get(tag_el.text_content())
        if tag:
            tags.append(tag)

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main=title_main,
            titles=[title_main],
        ),
        starts_at=starts_at,
        tags=tags,
        url_booking=url_booking,
    )
def __call__(self):
    """Download the page at ``self.url`` and yield a ``Showtime`` for
    every ``.event`` element found on it.

    Tags come from three places: ``self.tag_re`` patterns found in the
    titles (where the match is also removed from the title), the same
    patterns found in the short description, and the event category
    whenever it is not ``filmy``.
    """
    resp = self.session.get(self.url)
    html = parsers.html(resp.content, base_url=resp.url)

    for event in html.cssselect('.event'):
        header = event.cssselect_first('h2')
        url = header.link()
        title = header.text_content()

        # naive, but for now good enough
        # NOTE(review): the two parts are not stripped after the split.
        parts = title.split('/')
        title_main, title_orig = parts if len(parts) == 2 else (title, None)

        description = event.cssselect_first('.descshort').text_content()
        category = event.cssselect_first('.title-cat').text_content().lower()

        found = []
        for pattern, tag in self.tag_re:
            if pattern.search(title_main):
                found.append(tag)
                title_main = pattern.sub('', title_main).strip()
            if title_orig and pattern.search(title_orig):
                found.append(tag)
                title_orig = pattern.sub('', title_orig).strip()
            if pattern.search(description):
                found.append(tag)
        if category != 'filmy':
            found.append(category)

        day = parsers.date_cs(event.cssselect_first('.nextdate strong').text)
        clock_text = event.cssselect_first('.nextdate .evttime').text_content()
        clock = time(*map(int, clock_text.split(':')))
        starts_at = times.to_universal(datetime.combine(day, clock), self.tz)

        yield Showtime(
            cinema=cinema,
            film_scraped=ScrapedFilm(
                title_main_scraped=title_main,
                title_orig=title_orig or None,
            ),
            starts_at=starts_at,
            url=url,
            url_booking=self.url_booking,
            # Duplicates collapse here because tags are stored as keys.
            tags=dict.fromkeys(found),
        )
def _parse_entry(self, entry):
    """Create a ``Showtime`` from one programme entry.

    The first text line of the entry in which ``self.length_re`` is
    found is used as the film description; when there is no such line,
    ``None`` is returned. The description is split on commas and each
    piece is checked for a year, a length and subtitle/dubbing notes.
    """
    lines = entry.text_content(whitespace=True).splitlines()
    description = next(
        (line for line in lines if self.length_re.search(line)),
        None,
    )
    if description is None:
        return None  # it's not a film

    # The date text is split on dots and the numbers are passed to
    # ``datetime.datetime`` in reverse order.
    date_el = entry.cssselect_first('h4 span')
    date_numbers = [int(n) for n in date_el.text_content().split('.')]
    day = datetime.datetime(*date_numbers[::-1])

    start_el = entry.cssselect_first('.start')
    found_time = self.time_re.search(start_el.text_content())
    clock = datetime.time(
        int(found_time.group(1)),
        int(found_time.group(2)),
    )

    starts_at = times.to_universal(
        datetime.datetime.combine(day, clock),
        'Europe/Prague'
    )

    # The title is the text that follows the date element.
    title = date_el.tail

    tags = {}
    film_details = {}
    for piece in description.split(','):
        piece = piece.strip()
        if self.year_re.match(piece):
            film_details['year'] = int(piece)
        length_match = self.length_re.match(piece)
        if length_match:
            film_details['length'] = int(length_match.group(1))
        if any(word in piece for word in ('tit.', 'titulky', 'dabing')):
            tags[piece] = None

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main_scraped=title,
            **film_details
        ),
        starts_at=starts_at,
        tags=tags,
        url=self.url,
    )
def _parse_row(self, day, row, labels):
    """Yield a ``Showtime`` for every start time found in a film row.

    ``row`` and ``labels`` are walked in parallel, each with its first
    child skipped. A cell under a non-empty label describes the film:
    the ``Min.`` column sets the length (unless ``self._parse_details``
    already provided one) and any other labelled cell becomes a tag
    with the label as its value. Cells under an empty label hold
    whitespace-separated, colon-delimited start times on ``day``.

    A ``---`` cell means "no value" in every labelled column, including
    ``Min.``. Patterns from ``self.tag_re`` found in the title add a
    tag and are removed from the title.
    """
    link = row.cssselect_first('a.featureLink')
    title = link.text_content()
    details = self._parse_details(link.link())

    tags = {}
    showtimes = []

    cells = list(row.iterchildren())[1:]
    label_cells = list(labels.iterchildren())[1:]
    for cell_el, label_el in zip(cells, label_cells):
        cell = cell_el.text_content()
        label = label_el.text_content()
        if label:
            if cell == '---':
                # Placeholder for a missing value; in the length column
                # int('---') would raise ValueError.
                continue
            if label == 'Min.':
                details.setdefault('length', int(cell))
            else:
                tags[cell] = label
        elif cell:
            showtimes.extend(cell.split())

    for regexp, tag in self.tag_re:
        if regexp.search(title):
            tags[tag] = None
            title = regexp.sub('', title).strip()

    for st in showtimes:
        starts_at = times.to_universal(
            datetime.datetime.combine(
                day,
                datetime.time(*[int(n) for n in st.split(':')]),
            ),
            'Europe/Prague',
        )
        yield Showtime(
            cinema=self.cinema,
            film_scraped=ScrapedFilm(title_main_scraped=title, **details),
            starts_at=starts_at,
            tags=tags,
            url='http://www.cinemacity.cz/',
        )
def _parse_entry_text(self, title_text, details_text):
    """Generate showtimes from the text of a film header line.

    ``title_text`` is split by ``self._split_entry_text`` into pairs of
    title and dates. For every pair the film info is parsed once, and
    each start found in the dates (date ranges first, standalone dates
    after them) yields one ``Showtime``.
    """
    for entry_title, dates_text in self._split_entry_text(title_text):
        info = self._parse_info(entry_title, details_text)

        ranges = self._parse_date_ranges(dates_text)
        standalone = self._parse_standalone_dates(dates_text)

        # Both sources are collected in full before anything is yielded.
        starts = list(ranges)
        starts.extend(list(standalone))

        for starts_at in starts:
            film = ScrapedFilm(
                title_main_scraped=info.title_main,
                directors=info.directors,
            )
            yield Showtime(
                cinema=cinema,
                film_scraped=film,
                starts_at=starts_at,
                url=self.url,
                url_booking=self.url_booking,
                tags=dict.fromkeys(info.tags),
            )
def _parse_item(self, item):
    """Create a ``Showtime`` from one programme item.

    The ``.program-title`` element supplies the title and the film URL.
    The ``.program-date`` text is split into a date and a time on the
    ``v``/``ve`` separator and converted with ``times.to_universal``
    using the ``Europe/Prague`` zone. Further film details come from
    ``self._parse_details``.
    """
    title_el = item.cssselect_first('.program-title')
    film_title = title_el.text_content()
    film_url = title_el.link()

    date_text = item.cssselect_first('.program-date').text_content()
    day_text, time_text = re.split(r'\s+ve?\s+', date_text)

    day = parsers.date_cs(day_text)
    clock = datetime.time(*(int(part) for part in time_text.split(':')))
    starts_at = times.to_universal(
        datetime.datetime.combine(day, clock),
        'Europe/Prague',
    )

    film_details = self._parse_details(film_url)

    film = ScrapedFilm(
        title_main_scraped=film_title,
        url=film_url,
        **film_details
    )
    return Showtime(
        cinema=cinema,
        film_scraped=film,
        starts_at=starts_at,
        url=self.url,
    )
def _parse_row(self, row, subrow, tags=None):
    """Create a ``Showtime`` from a programme row and one of its subrows.

    ``row`` provides the date (``.film_table_datum``), ``subrow`` the
    time (``.cas``) and, through ``self._parse_subrow``, the title,
    booking and tag elements. Each name in ``tags`` is looked up in
    ``self.tags``; the subrow's own tag element, if present, is added
    via ``self._parse_tag``.

    Returns ``None`` when the subrow has no title element or when the
    title is listed in ``self.title_blacklist``.
    """
    parts = self._parse_subrow(subrow)

    title_el = parts.get('title')
    if title_el is None:
        return None

    film_title = title_el.text_content()
    if film_title in self.title_blacklist:
        return None

    film_url = title_el.link()

    date_text = row.cssselect('.film_table_datum')[0].text_content()
    time_text = subrow.cssselect('.cas')[0].text_content()
    starts_at = parsers.date_time_year(date_text, time_text)

    booking_el = parts.get('booking')
    if booking_el is None:
        booking_url = None
    else:
        booking_url = booking_el.link()

    showtime_tags = dict((name, self.tags[name]) for name in (tags or ()))
    tag_el = parts.get('tag')
    if tag_el is not None:
        # ``_parse_tag`` yields a (key, value) pair.
        key, value = self._parse_tag(tag_el)
        showtime_tags[key] = value

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main_scraped=film_title,
            url=film_url,
        ),
        starts_at=starts_at,
        tags=showtime_tags,
        url=self.url,
        url_booking=booking_url,
    )