def _parse_row(self, row, tags=None):
    """Build a ``Showtime`` from one listing row.

    The film link comes from the first ``.movie a:not(.tag)`` element.
    The text of the ``.date`` element (read with ``whitespace=True``) is
    split on line breaks and must yield exactly two parts: a date parsed
    by ``parsers.date_cs`` and a colon-separated time. The combined value
    is converted with ``times.to_universal`` using ``'Europe/Prague'``.

    :param row: element exposing ``cssselect_first``.
    :param tags: passed through to ``self._parse_tags`` together with
        ``row``; the result replaces it.
    :returns: ``Showtime`` whose ``url`` is ``self.url`` and whose film
        carries the keyword arguments from ``self._parse_details``.
    """
    link_el = row.cssselect_first('.movie a:not(.tag)')
    film_url = link_el.link()
    film_title = link_el.text_content()

    when_text = row.cssselect_first('.date').text_content(whitespace=True)
    day_text, clock_text = re.split(r'[\r\n]+', when_text)

    # Date first, then time: keeps the original evaluation order.
    day = parsers.date_cs(day_text)
    clock = datetime.time(*[int(piece) for piece in clock_text.split(':')])
    local_start = datetime.datetime.combine(day, clock)
    starts_at = times.to_universal(local_start, 'Europe/Prague')

    tags = self._parse_tags(row, tags)
    extra = self._parse_details(film_url)

    film = ScrapedFilm(title_main_scraped=film_title, url=film_url, **extra)
    return Showtime(
        cinema=cinema,
        film_scraped=film,
        starts_at=starts_at,
        tags=tags,
        url=self.url,
    )
def _parse_row(self, row, tags=None):
    """Turn a single program row into a ``Showtime``.

    Reads the link and title from ``.movie a:not(.tag)``, then the start
    from ``.date``: its text is split on runs of CR/LF into a date part
    (handed to ``parsers.date_cs``) and an integer ``:``-separated time
    part. Exactly two parts are expected, otherwise the unpacking fails.

    :param row: element queried through ``cssselect_first``.
    :param tags: forwarded to ``self._parse_tags(row, tags)``.
    :returns: a ``Showtime`` for ``cinema`` with ``url=self.url``.
    """
    anchor = row.cssselect_first('.movie a:not(.tag)')
    url = anchor.link()
    title = anchor.text_content()

    raw_when = row.cssselect_first('.date').text_content(whitespace=True)
    raw_date, raw_time = re.split(r'[\r\n]+', raw_when)

    parsed_date = parsers.date_cs(raw_date)
    parsed_time = datetime.time(*map(int, raw_time.split(':')))
    starts_at = times.to_universal(
        datetime.datetime.combine(parsed_date, parsed_time),
        'Europe/Prague',
    )

    # Tags are resolved before details, same as the original ordering.
    tags = self._parse_tags(row, tags)
    details = self._parse_details(url)

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main_scraped=title,
            url=url,
            **details
        ),
        starts_at=starts_at,
        tags=tags,
        url=self.url,
    )
def __call__(self):
    """Yield the result of ``self._parse_row`` for every non-day row.

    Iterates ``self._scrape_rows()``. A row whose element has the ``day``
    class is not yielded; its text is parsed with ``parsers.date_cs`` and
    remembered as the date for the rows that follow. Rows seen before the
    first such header are parsed with ``None`` as the date.
    """
    current_day = None
    for row in self._scrape_rows():
        if not row.element.has_class('day'):
            yield self._parse_row(row.element, current_day, row.url)
            continue
        # Day header: update the date applied to subsequent rows.
        current_day = parsers.date_cs(row.element.text_content())
def __call__(self):
    """Yield a ``Showtime`` for each ``.event`` element found at ``self.url``.

    For every event:

    * the ``h2`` header supplies the link and the title; a title with
      exactly one ``/`` is split into main and original title;
    * each ``(regexp, tag)`` pair in ``self.tag_re`` that matches a title
      adds the tag and removes the match from that title; a match in the
      ``.descshort`` text only adds the tag;
    * the lower-cased ``.title-cat`` text is added as a tag unless it is
      ``'filmy'``;
    * the start is built from ``.nextdate strong`` (date) and
      ``.nextdate .evttime`` (``:``-separated time) and converted with
      ``times.to_universal`` using ``self.tz``.

    Tags end up as dictionary keys mapped to ``None``, so repeats collapse.
    An empty original title is passed on as ``None``.
    """
    response = self.session.get(self.url)
    page = parsers.html(response.content, base_url=response.url)

    for event in page.cssselect('.event'):
        heading = event.cssselect_first('h2')
        event_url = heading.link()
        full_title = heading.text_content()

        # naive, but for now good enough
        pieces = full_title.split('/')
        if len(pieces) == 2:
            title_main, title_orig = pieces
        else:
            title_main, title_orig = full_title, None

        description = event.cssselect_first('.descshort').text_content()
        category = event.cssselect_first('.title-cat').text_content().lower()

        found_tags = []
        for pattern, tag in self.tag_re:
            if pattern.search(title_main):
                found_tags.append(tag)
                title_main = pattern.sub('', title_main).strip()
            if title_orig and pattern.search(title_orig):
                found_tags.append(tag)
                title_orig = pattern.sub('', title_orig).strip()
            if pattern.search(description):
                found_tags.append(tag)
        if category != 'filmy':
            found_tags.append(category)

        day = parsers.date_cs(event.cssselect_first('.nextdate strong').text)
        clock_text = event.cssselect_first('.nextdate .evttime').text_content()
        clock = time(*(int(piece) for piece in clock_text.split(':')))
        starts_at = times.to_universal(datetime.combine(day, clock), self.tz)

        yield Showtime(
            cinema=cinema,
            film_scraped=ScrapedFilm(
                title_main_scraped=title_main,
                title_orig=title_orig or None,
            ),
            starts_at=starts_at,
            url=event_url,
            url_booking=self.url_booking,
            # Same keys, order and None values as a dict comprehension.
            tags=dict.fromkeys(found_tags),
        )
def __call__(self):
    """Fetch ``self.url`` and yield one ``Showtime`` per ``.event`` element.

    The event header (``h2``) gives the URL and the title. A title that
    splits on ``/`` into exactly two parts is treated as main/original
    title; otherwise the whole text is the main title and there is no
    original title. Patterns from ``self.tag_re`` are searched in both
    titles (matches are removed and the remainder stripped) and in the
    ``.descshort`` text; every hit records the paired tag. The
    lower-cased ``.title-cat`` text is recorded too unless it equals
    ``'filmy'``. The start time combines ``.nextdate strong`` and
    ``.nextdate .evttime`` and is converted via ``times.to_universal``
    with ``self.tz``.
    """
    resp = self.session.get(self.url)
    document = parsers.html(resp.content, base_url=resp.url)

    for node in document.cssselect('.event'):
        head = node.cssselect_first('h2')
        link = head.link()
        raw_title = head.text_content()

        parts = raw_title.split('/')
        has_original = len(parts) == 2  # naive, but for now good enough
        main = parts[0] if has_original else raw_title
        original = parts[1] if has_original else None

        blurb = node.cssselect_first('.descshort').text_content()
        kind = node.cssselect_first('.title-cat').text_content().lower()

        collected = []
        for rx, label in self.tag_re:
            if rx.search(main):
                collected.append(label)
                main = rx.sub('', main).strip()
            if original and rx.search(original):
                collected.append(label)
                original = rx.sub('', original).strip()
            if rx.search(blurb):
                collected.append(label)
        if kind != 'filmy':
            collected.append(kind)

        date_part = parsers.date_cs(
            node.cssselect_first('.nextdate strong').text
        )
        time_text = node.cssselect_first('.nextdate .evttime').text_content()
        hour_minute = [int(chunk) for chunk in time_text.split(':')]
        time_part = time(*hour_minute)
        starts_at = times.to_universal(
            datetime.combine(date_part, time_part), self.tz
        )

        yield Showtime(
            cinema=cinema,
            film_scraped=ScrapedFilm(
                title_main_scraped=main,
                title_orig=original or None,  # empty string becomes None
            ),
            starts_at=starts_at,
            url=link,
            url_booking=self.url_booking,
            tags={label: None for label in collected},
        )
def _parse_item(self, item):
    """Build a ``Showtime`` from one program item.

    Title and link both come from the ``.program-title`` element. The
    ``.program-date`` text is split on a whitespace-delimited ``v`` or
    ``ve`` into a date (parsed by ``parsers.date_cs``) and a
    ``:``-separated time; exactly two parts are expected. The combined
    value is converted with ``times.to_universal`` using
    ``'Europe/Prague'``.

    :param item: element exposing ``cssselect_first``.
    :returns: ``Showtime`` with ``url=self.url`` whose film carries the
        keyword arguments returned by ``self._parse_details``.
    """
    title_main = item.cssselect_first('.program-title').text_content()
    film_url = item.cssselect_first('.program-title').link()

    when_text = item.cssselect_first('.program-date').text_content()
    day_text, clock_text = re.split(r'\s+ve?\s+', when_text)

    # Date first, then time: keeps the original evaluation order.
    day = parsers.date_cs(day_text)
    clock = datetime.time(*[int(piece) for piece in clock_text.split(':')])
    starts_at = times.to_universal(
        datetime.datetime.combine(day, clock),
        'Europe/Prague',
    )

    extra = self._parse_details(film_url)
    film = ScrapedFilm(title_main_scraped=title_main, url=film_url, **extra)

    return Showtime(
        cinema=cinema,
        film_scraped=film,
        starts_at=starts_at,
        url=self.url,
    )
def _parse_item(self, item):
    """Convert a single program item into a ``Showtime``.

    ``.program-title`` provides the main title and the detail URL;
    ``.program-date`` provides text of the form
    ``<date> v <time>`` / ``<date> ve <time>`` (the separator is matched
    by ``\\s+ve?\\s+``). The date goes through ``parsers.date_cs``, the
    time is read as ``:``-separated integers, and the result is passed
    to ``times.to_universal`` with ``'Europe/Prague'``.

    :param item: element queried through ``cssselect_first``.
    :returns: a ``Showtime`` for ``cinema`` pointing at ``self.url``.
    """
    title = item.cssselect_first('.program-title').text_content()
    url = item.cssselect_first('.program-title').link()

    raw_when = item.cssselect_first('.program-date').text_content()
    raw_date, raw_time = re.split(r'\s+ve?\s+', raw_when)

    parsed_date = parsers.date_cs(raw_date)
    parsed_time = datetime.time(*map(int, raw_time.split(':')))
    naive_start = datetime.datetime.combine(parsed_date, parsed_time)
    starts_at = times.to_universal(naive_start, 'Europe/Prague')

    # Details are fetched only after the start time has parsed.
    details = self._parse_details(url)

    return Showtime(
        cinema=cinema,
        film_scraped=ScrapedFilm(
            title_main_scraped=title,
            url=url,
            **details
        ),
        starts_at=starts_at,
        url=self.url,
    )