def _parse_event(self, event):
        """Build a :class:`Showtime` from a single calendar event.

        Reads ``dtstart``, ``summary`` and ``description`` from ``event``.
        When the description matches ``self.desc_re``, the original title,
        year, length and tags are taken from the match groups ``title``,
        ``year``, ``min`` and ``tags``; otherwise they stay ``None`` / empty.

        :param event: object exposing ``get()``; the ``dtstart`` value must
            provide a ``dt`` attribute.
        :returns: a :class:`Showtime` for the ``cinema`` name visible in the
            enclosing scope.
        """
        starts_at = times.to_universal(event.get('dtstart').dt)
        title_main = event.get('summary')
        titles = [title_main]

        title_orig = year = length = None
        tags = []

        # An event without a description makes get() return None; handing
        # that to the pattern would raise instead of simply not matching.
        description = event.get('description')
        match = (
            self.desc_re.match(description)
            if description is not None else None
        )
        if match:
            if match.group('title'):
                title_orig = match.group('title').strip()
                titles.append(title_orig)

            year = int(match.group('year'))
            length = int(match.group('min'))

            # Tags missing from tags_map come through as None here.
            tags = [
                self.tags_map.get(t.strip())
                for t in match.group('tags').split(',')
            ]

        return Showtime(
            cinema=cinema,
            film_scraped=ScrapedFilm(
                title_main=title_main,
                title_orig=title_orig,
                titles=titles,
                year=year,
                length=length,
            ),
            starts_at=starts_at,
            tags=tags,
        )
    def _parse_row(self, row):
        """Turn one table row into a :class:`Showtime`.

        Cells are read by position: 1 and 2 are passed to
        ``parsers.date_time_year``, 3 and 4 hold the main and original
        title, 5 and 6 are looked up in ``self.tags_map`` and 8 supplies the
        booking link.

        :param row: indexable sequence of cells offering ``text_content()``
            and ``link()``.
        :returns: a :class:`Showtime` whose ``tags`` is a dict with ``None``
            values, keyed by the mapped tags that are truthy.
        """
        date_text = row[1].text_content()
        time_text = row[2].text_content()
        starts_at = parsers.date_time_year(date_text, time_text)

        title_main = row[3].text_content()
        title_orig = row[4].text_content()

        # TODO scrape tags according to new implementation of tags
        # presented in https://github.com/honzajavorek/zitkino.cz/issues/97
        raw_tags = (row[5].text_content(), row[6].text_content())
        mapped_tags = [self.tags_map.get(raw) for raw in raw_tags]

        url_booking = row[8].link()

        film = ScrapedFilm(
            title_main_scraped=title_main,
            title_orig_scraped=title_orig,
        )
        # Unmapped cells produce None and are left out of the result.
        known_tags = dict.fromkeys(tag for tag in mapped_tags if tag)

        return Showtime(
            cinema=cinema,
            film_scraped=film,
            starts_at=starts_at,
            tags=known_tags,
            url=self.url,
            url_booking=url_booking,
        )
Beispiel #3
0
    def _parse_row(self, row):
        """Turn one table row into a :class:`Showtime` including its price.

        Cells are read by position: 1 and 2 are passed to
        ``parsers.date_time_year``, 3 and 4 hold the main and original
        title, 5 and 6 are looked up in ``self.tags_map``, 7 is parsed by
        ``parsers.price`` and 8 supplies the booking link.

        :param row: indexable sequence of cells offering ``text_content()``
            and ``link()``.
        :returns: a :class:`Showtime`; its ``tags`` list keeps a ``None``
            entry for every cell that has no mapping.
        """
        date_text = row[1].text_content()
        time_text = row[2].text_content()
        starts_at = parsers.date_time_year(date_text, time_text)

        title_main = row[3].text_content()
        title_orig = row[4].text_content()

        raw_tags = (row[5].text_content(), row[6].text_content())
        tags = list(map(self.tags_map.get, raw_tags))

        url_booking = row[8].link()
        price = parsers.price(row[7].text_content())

        film = ScrapedFilm(
            title_main=title_main,
            titles=[title_main, title_orig],
        )
        return Showtime(
            cinema=cinema,
            film_scraped=film,
            starts_at=starts_at,
            tags=tags,
            url_booking=url_booking,
            price=price,
        )
Beispiel #4
0
    def _parse_event(self, event):
        """Build a :class:`Showtime` from a single calendar event.

        Reads ``dtstart``, ``summary`` and ``description`` from ``event``.
        When the description matches ``self.desc_re``, the original title,
        year, length and tags are taken from the match groups ``title``,
        ``year``, ``min`` and ``tags``; otherwise they stay ``None`` / empty.

        :param event: object exposing ``get()``; the ``dtstart`` value must
            provide a ``dt`` attribute.
        :returns: a :class:`Showtime` whose ``tags`` is a dict with ``None``
            values, keyed by the mapped tags that are truthy.
        """
        starts_at = times.to_universal(event.get('dtstart').dt)
        title_main = event.get('summary')

        title_orig = year = length = None
        tags = []

        # An event without a description makes get() return None; handing
        # that to the pattern would raise instead of simply not matching.
        description = event.get('description')
        match = (
            self.desc_re.match(description)
            if description is not None else None
        )
        if match:
            if match.group('title'):
                title_orig = match.group('title').strip()

            year = int(match.group('year'))
            length = int(match.group('min'))

            # TODO scrape tags according to new implementation of tags
            # presented in https://github.com/honzajavorek/zitkino.cz/issues/97
            tags = [
                self.tags_map.get(t.strip())
                for t in match.group('tags').split(',')
            ]

        return Showtime(
            cinema=cinema,
            film_scraped=ScrapedFilm(
                title_main_scraped=title_main,
                title_orig_scraped=title_orig,
                year=year,
                length=length,
            ),
            starts_at=starts_at,
            # Tags without a mapping come back as None and are dropped.
            tags={tag: None
                  for tag in tags if tag},
            url='http://kinonadobraku.cz',
        )
Beispiel #5
0
    def _parse_row(self, row, tags=None):
        """Turn one programme row into a :class:`Showtime`.

        The film link and title come from the ``.movie a:not(.tag)``
        element. The ``.date`` element's text is split on line breaks into a
        date part (parsed by ``parsers.date_cs``) and a colon-separated time
        part; the combination is converted with ``times.to_universal`` using
        ``'Europe/Prague'``.

        :param row: element offering ``cssselect_first()``.
        :param tags: forwarded unchanged to ``self._parse_tags``.
        :returns: a :class:`Showtime` whose film carries the keyword values
            returned by ``self._parse_details`` for the film URL.
        """
        link_el = row.cssselect_first('.movie a:not(.tag)')
        url = link_el.link()
        title = link_el.text_content()

        when_text = row.cssselect_first('.date').text_content(whitespace=True)
        date_text, time_text = re.split(r'[\r\n]+', when_text)

        day = parsers.date_cs(date_text)
        clock = datetime.time(*[int(part) for part in time_text.split(':')])
        local_start = datetime.datetime.combine(day, clock)
        starts_at = times.to_universal(local_start, 'Europe/Prague')

        tags = self._parse_tags(row, tags)
        details = self._parse_details(url)

        film = ScrapedFilm(title_main_scraped=title, url=url, **details)
        return Showtime(
            cinema=cinema,
            film_scraped=film,
            starts_at=starts_at,
            tags=tags,
            url=self.url,
        )
Beispiel #6
0
    def _parse_row(self, row, date, url):
        """Build a :class:`~zitkino.models.Showtime` from one row.

        Each cell of ``row`` is inspected for its CSS classes; a cell may
        contribute the start time and booking link, the film itself, or
        additional tags. The class checks are independent of each other, so
        one cell can feed several of them.

        :param row: iterable of cells offering ``has_class()``.
        :param date: date information handed on to ``self._parse_time``.
        :param url: URL stored on the resulting showtime.
        :returns: the populated showtime object.
        """
        showtime = Showtime(cinema=cinema, url=url)
        collected_tags = {}

        for cell in row:
            if cell.has_class('col_time_reservation'):
                showtime.starts_at = self._parse_time(cell, date)
                showtime.url_booking = cell.link()

            if cell.has_class('col_movie_name'):
                info = self._parse_info(cell)
                showtime.film_scraped = ScrapedFilm(
                    title_main_scraped=info.title,
                    url=info.url,
                    **self._parse_details(info.url)
                )
                # Tags found next to the film name carry no extra value.
                collected_tags.update(dict.fromkeys(info.tags))

            if cell.has_class('col_param_icons'):
                collected_tags.update(self._parse_tags_from_icons(cell))

            if cell.has_class('col_cycle'):
                collected_tags.update(self._parse_tags_from_cycles(cell))

        showtime.tags = collected_tags
        return showtime
Beispiel #7
0
    def _parse_row(self, row, subrow, tags=None):
        """Build a :class:`Showtime` from a row and one of its subrows.

        :param row: element whose first ``.film_table_datum`` match supplies
            the date text.
        :param subrow: element whose first ``.cas`` match supplies the time
            text; it is also handed to ``self._parse_subrow``.
        :param tags: optional tags to start from; the passed object is
            never modified.
        :returns: a :class:`Showtime`, or ``None`` when the subrow has no
            title element or the title is in ``self.title_blacklist``.
        """
        elements = self._parse_subrow(subrow)

        title_el = elements.get('title')
        if title_el is None:
            return None
        title_main = title_el.text_content()
        if title_main in self.title_blacklist:
            return None

        starts_at = parsers.date_time_year(
            row.cssselect('.film_table_datum')[0].text_content(),
            subrow.cssselect('.cas')[0].text_content(),
        )

        booking_el = elements.get('booking')
        url_booking = booking_el.link() if booking_el is not None else None

        # Work on a copy: appending to the caller's own list would leak this
        # subrow's tag into every later call that reuses the same list.
        tags = list(tags) if tags else []
        tag_el = elements.get('tag')
        if tag_el is not None:
            tags.append(self.tags_map.get(tag_el.text_content()))

        return Showtime(
            cinema=cinema,
            film_scraped=ScrapedFilm(
                title_main=title_main,
                titles=[title_main],
            ),
            starts_at=starts_at,
            tags=tags,
            url_booking=url_booking,
        )
Beispiel #8
0
    def __call__(self):
        """Fetch ``self.url`` and yield one :class:`Showtime` per event.

        Every ``.event`` element of the downloaded page is processed. A
        heading containing exactly one ``/`` is split into main and original
        title. Each ``(regexp, tag)`` pair in ``self.tag_re`` is searched for
        in both titles and in the short description; a hit adds the tag and
        removes the matched text from the title. A category other than
        ``'filmy'`` is added as a tag as well.
        """
        resp = self.session.get(self.url)
        html = parsers.html(resp.content, base_url=resp.url)

        for event_el in html.cssselect('.event'):
            heading = event_el.cssselect_first('h2')

            url = heading.link()
            title = heading.text_content()

            # naive, but for now good enough
            parts = title.split('/')
            if len(parts) == 2:
                title_main, title_orig = parts
            else:
                title_main, title_orig = title, None

            details = event_el.cssselect_first('.descshort').text_content()
            category = (
                event_el.cssselect_first('.title-cat').text_content().lower()
            )

            found_tags = []
            for regexp, tag in self.tag_re:
                if regexp.search(title_main):
                    found_tags.append(tag)
                    title_main = regexp.sub('', title_main).strip()
                if title_orig and regexp.search(title_orig):
                    found_tags.append(tag)
                    title_orig = regexp.sub('', title_orig).strip()
                if regexp.search(details):
                    found_tags.append(tag)
            if category != 'filmy':
                found_tags.append(category)

            day = parsers.date_cs(
                event_el.cssselect_first('.nextdate strong').text
            )
            clock_text = (
                event_el.cssselect_first('.nextdate .evttime').text_content()
            )
            clock = time(*[int(part) for part in clock_text.split(':')])

            starts_at = times.to_universal(
                datetime.combine(day, clock), self.tz
            )

            film = ScrapedFilm(
                title_main_scraped=title_main,
                title_orig=title_orig or None,
            )
            yield Showtime(
                cinema=cinema,
                film_scraped=film,
                starts_at=starts_at,
                url=url,
                url_booking=self.url_booking,
                # Repeated tags collapse into a single key.
                tags=dict.fromkeys(found_tags),
            )
Beispiel #9
0
    def _parse_entry(self, entry):
        """Build a :class:`Showtime` from one programme entry.

        The first text line in which ``self.length_re`` finds a match is
        taken as the film description; an entry without such a line is not
        a film. The description is split on commas, and each piece may
        supply the year, the length, or a subtitles/dubbing tag.

        :param entry: element offering ``text_content()`` and
            ``cssselect_first()``.
        :returns: a :class:`Showtime`, or ``None`` when no description line
            is found.
        """
        lines = entry.text_content(whitespace=True).splitlines()
        description = next(
            (line for line in lines if self.length_re.search(line)),
            None,
        )
        if description is None:
            return None  # it's not a film

        date_el = entry.cssselect_first('h4 span')
        # The text is dot-separated; reversing its numbers gives the
        # argument order that datetime.datetime() receives.
        date_numbers = [int(n) for n in date_el.text_content().split('.')]
        day = datetime.datetime(*date_numbers[::-1])

        time_el = entry.cssselect_first('.start')
        time_match = self.time_re.search(time_el.text_content())
        clock = datetime.time(
            int(time_match.group(1)),
            int(time_match.group(2)),
        )

        starts_at = times.to_universal(
            datetime.datetime.combine(day, clock),
            'Europe/Prague'
        )
        # The title is the text that follows the date element.
        title = date_el.tail

        tags = {}
        film_kwargs = {}

        for detail in (piece.strip() for piece in description.split(',')):
            if self.year_re.match(detail):
                film_kwargs['year'] = int(detail)

            length_match = self.length_re.match(detail)
            if length_match:
                film_kwargs['length'] = int(length_match.group(1))

            if any(marker in detail
                   for marker in ('tit.', 'titulky', 'dabing')):
                tags[detail] = None

        film = ScrapedFilm(title_main_scraped=title, **film_kwargs)
        return Showtime(
            cinema=cinema,
            film_scraped=film,
            starts_at=starts_at,
            tags=tags,
            url=self.url,
        )
Beispiel #10
0
    def _parse_row(self, day, row, labels):
        """Yield a :class:`Showtime` for every start time found in ``row``.

        The children of ``row`` and ``labels`` (without their first child)
        are paired up. A labelled cell is either the film length (label
        ``'Min.'``) or a tag stored as ``tags[cell] = label``; an unlabelled,
        non-empty cell holds whitespace-separated start times. Tags matched
        by ``self.tag_re`` are removed from the title.

        :param day: date that each start time is combined with.
        :param row: element with the film link (``a.featureLink``) and the
            value cells.
        :param labels: element whose children label the cells of ``row``.
        """
        a = row.cssselect_first('a.featureLink')
        title = a.text_content()
        url = a.link()

        details = self._parse_details(url)
        tags = {}
        showtimes = []

        row = list(row.iterchildren())[1:]
        labels = list(labels.iterchildren())[1:]
        table = [(cell_el.text_content(), label_el.text_content())
                 for (cell_el, label_el) in zip(row, labels)]

        for cell, label in table:
            if label:
                if label == 'Min.':
                    # The tag branch below already treats '---' as "no
                    # value"; int('---') would raise, so skip it here too.
                    if cell != '---':
                        details.setdefault('length', int(cell))
                elif cell != '---':
                    tags[cell] = label
            elif cell:
                showtimes.extend(cell.split())

        for regexp, tag in self.tag_re:
            if regexp.search(title):
                tags[tag] = None
                title = regexp.sub('', title).strip()

        for st in showtimes:
            starts_at = times.to_universal(
                datetime.datetime.combine(
                    day, datetime.time(*[int(n) for n in st.split(':')])),
                'Europe/Prague')

            yield Showtime(
                cinema=self.cinema,
                film_scraped=ScrapedFilm(title_main_scraped=title, **details),
                starts_at=starts_at,
                tags=tags,
                url='http://www.cinemacity.cz/',
            )
Beispiel #11
0
    def _parse_entry_text(self, title_text, details_text):
        """Yield a :class:`Showtime` for each date of each film in an entry.

        ``self._split_entry_text`` breaks ``title_text`` into
        ``(title, dates)`` pairs. For every pair the film info is parsed
        first; the start times are then gathered from the date ranges
        followed by the standalone dates.

        :param title_text: header text handed to ``self._split_entry_text``.
        :param details_text: handed to ``self._parse_info`` together with
            each film title.
        """
        for film_title, dates_text in self._split_entry_text(title_text):
            info = self._parse_info(film_title, details_text)

            ranged = self._parse_date_ranges(dates_text)
            standalone = self._parse_standalone_dates(dates_text)

            # Both sources are consumed in full before anything is yielded.
            all_starts = [*ranged, *standalone]
            for starts_at in all_starts:
                film = ScrapedFilm(
                    title_main_scraped=info.title_main,
                    directors=info.directors,
                )
                yield Showtime(
                    cinema=cinema,
                    film_scraped=film,
                    starts_at=starts_at,
                    url=self.url,
                    url_booking=self.url_booking,
                    tags=dict.fromkeys(info.tags),
                )
Beispiel #12
0
    def _parse_item(self, item):
        """Build a :class:`Showtime` from one programme item.

        The title and film URL come from the ``.program-title`` element.
        The ``.program-date`` text is split around a whitespace-delimited
        ``v`` / ``ve`` into a date part (parsed by ``parsers.date_cs``) and a
        colon-separated time part; the combination is converted with
        ``times.to_universal`` using ``'Europe/Prague'``.

        :param item: element offering ``cssselect_first()``.
        :returns: a :class:`Showtime` whose film carries the keyword values
            returned by ``self._parse_details`` for the film URL.
        """
        title_el = item.cssselect_first('.program-title')
        title_main = title_el.text_content()
        url = title_el.link()

        when_text = item.cssselect_first('.program-date').text_content()
        date_text, time_text = re.split(r'\s+ve?\s+', when_text)

        day = parsers.date_cs(date_text)
        clock = datetime.time(*[int(part) for part in time_text.split(':')])
        starts_at = times.to_universal(
            datetime.datetime.combine(day, clock),
            'Europe/Prague',
        )

        details = self._parse_details(url)

        film = ScrapedFilm(title_main_scraped=title_main, url=url, **details)
        return Showtime(
            cinema=cinema,
            film_scraped=film,
            starts_at=starts_at,
            url=self.url,
        )
Beispiel #13
0
    def _parse_row(self, row, subrow, tags=None):
        """Build a :class:`Showtime` from a row and one of its subrows.

        :param row: element whose first ``.film_table_datum`` match supplies
            the date text.
        :param subrow: element whose first ``.cas`` match supplies the time
            text; it is also handed to ``self._parse_subrow``.
        :param tags: optional iterable of keys that are looked up in
            ``self.tags``.
        :returns: a :class:`Showtime`, or ``None`` when the subrow has no
            title element or the title is in ``self.title_blacklist``.
        """
        elements = self._parse_subrow(subrow)

        title_el = elements.get('title')
        if title_el is None:
            return None

        title_main = title_el.text_content()
        if title_main in self.title_blacklist:
            return None

        url = title_el.link()

        date_text = row.cssselect('.film_table_datum')[0].text_content()
        time_text = subrow.cssselect('.cas')[0].text_content()
        starts_at = parsers.date_time_year(date_text, time_text)

        booking_el = elements.get('booking')
        if booking_el is not None:
            url_booking = booking_el.link()
        else:
            url_booking = None

        showtime_tags = {}
        for key in tags or ():
            showtime_tags[key] = self.tags[key]

        tag_el = elements.get('tag')
        if tag_el is not None:
            # _parse_tag hands back a key/value pair for the tag dict.
            tag_key, tag_value = self._parse_tag(tag_el)
            showtime_tags[tag_key] = tag_value

        film = ScrapedFilm(title_main_scraped=title_main, url=url)
        return Showtime(
            cinema=cinema,
            film_scraped=film,
            starts_at=starts_at,
            tags=showtime_tags,
            url=self.url,
            url_booking=url_booking,
        )