def parse(self, page):
    """Extract personal details from a person's HTML page.

    Parses ``page`` with ``fromstring`` and keeps the tree on ``self.html``.
    Sets ``self.alternative_name`` from the first
    ``span[@itemprop="alternateName"]`` when one exists, then walks the rows
    of ``table[@class="info"]`` and, depending on the row label, sets
    ``birth_date``, ``birth_place``, ``growth``, ``death_date`` and
    ``death_place``.

    Rows that lack a label cell or a following value cell are skipped
    instead of raising.

    :param page: HTML source of the person page.
    """
    self.html = fromstring(page)

    alternative_span = self.html.xpath('//span[@itemprop="alternateName"]')
    if alternative_span:
        self.alternative_name = alternative_span[0].text_content()

    for tr in self.html.xpath('//table[@class="info"]//tr'):
        type_cells = tr.xpath('.//td[@class="type"]')
        if not type_cells:
            continue
        td = type_cells[0]
        value_cell = td.getnext()
        if value_cell is None:
            continue
        info_type = td.text_content()
        info = value_cell.text_content()

        if info_type == u'дата рождения':  # "date of birth"
            birth_cells = tr.xpath('.//td[@class="birth"]')
            if not birth_cells:
                continue
            self.birth_date = birth_cells[0].get("birthdate")
            # A leading '-' is rewritten as a "<date> BC" string.
            if (self.birth_date is not None
                    and self.birth_date.startswith('-')):
                self.birth_date = '%s BC' % self.birth_date[1:]
                logger.warning('Birth date = %s', self.birth_date)
        elif info_type == u'место рождения':  # "place of birth"
            self.birth_place = info
        elif info_type == u'рост':  # "height"
            # Escaped backslashes keep this a unicode literal like the
            # file's other patterns while avoiding invalid escapes.
            m = re.search(u'(\\d+)\\.(\\d+) м', info, re.UNICODE)
            if m is not None:
                # Normalise the fraction to exactly two digits so that
                # "1.8" yields 180 rather than 108.
                hundredths = m.group(2).ljust(2, '0')[:2]
                self.growth = int(m.group(1)) * 100 + int(hundredths)
        elif info_type == u'дата смерти':  # "date of death"
            logger.warning(info)
            # Only the text before the bullet, when present, is the date.
            m = re.search(u'^(.+)•', info, re.UNICODE)
            raw_date = info if m is None else m.group(1)
            date = get_date(raw_date.strip()).get('date')
            logger.warning('date = %s', date)
            self.death_date = date
        elif info_type == u'место смерти':  # "place of death"
            self.death_place = info
def get_dates(self):
    """Fetch the film's dates page and record one entry per listed row.

    Downloads ``https://www.kinopoisk.ru/film/<full_id>/dates/`` through
    ``self.get_page``. For every row containing a ``flag`` div it appends
    ``{'date', 'country_id', 'commentary', 'viewers'}`` to ``self.dates``.
    A country whose id is not in ``self.countries`` is queued in
    ``self.countries_to_save``, at most once per call.

    Rows without a date or country cell are skipped with a warning. A
    missing commentary becomes ``''`` and a missing or unparsable viewer
    count becomes ``None``.

    Returns ``None``; logs a warning and returns early when the page
    could not be fetched.
    """
    page = self.get_page(
        'https://www.kinopoisk.ru/film/%s/dates/' % self.full_id)
    if page is None:
        logger.warning('There is no information about dates')
        return

    html = fromstring(page)
    # Built once instead of per row; also records ids queued during this
    # call so a country with several rows is not queued repeatedly.
    known_country_ids = set(country['id'] for country in self.countries)

    for div in html.xpath('//table//tr//div[contains(@class, "flag")]'):
        td_date = div.getparent().getnext()
        td_info = td_date.getnext() if td_date is not None else None
        if td_info is None or len(td_date) == 0:
            logger.warning('Skipping dates row without a date cell')
            continue

        country_links = td_info.xpath('.//a[contains(@class, "all")]')
        if not country_links:
            logger.warning('Skipping dates row without a country link')
            continue

        date = get_date(td_date[0].text_content().strip())
        country = country_links[0].text_content()
        country_id = self.extract_country_id_from_url(
            country_links[0].get('href'))

        small_nodes = td_info.xpath('.//small')
        small = small_nodes[0].text_content().strip() if small_nodes else ''

        # The viewer count sits in the cell after the country cell, as
        # text before "чел." ("people"); keep only its digits.
        count = None
        td_count = td_info.getnext()
        count_nodes = td_count.xpath('.//small') if td_count is not None else []
        if count_nodes:
            m = re.search(
                u'(.+)чел.', count_nodes[0].text_content(), re.UNICODE)
            if m is not None:
                try:
                    count = int(re.sub(r'[^\d]', '', m.group(1)))
                except ValueError:
                    count = None

        if country_id not in known_country_ids:
            self.countries_to_save.append(
                {'id': country_id, 'name': country})
            known_country_ids.add(country_id)

        self.dates.append({
            'date': date,
            'country_id': country_id,
            'commentary': small,
            'viewers': count,
        })
def get_premieres(self, elem):
    """Record the premiere described by the first ``prem_ical`` div in ``elem``.

    Does nothing when ``elem`` contains no ``div[@class="prem_ical"]``.
    Otherwise builds a dict holding ``'region'`` (the ``data-ical-type``
    attribute) merged with the result of ``get_date`` on the
    ``data-ical-date`` attribute, and appends it to ``self.premieres``.
    When the resulting region is ``'world'``, ``self.world_premiere`` is
    set to the parsed ``'date'`` value.

    :param elem: element searched with a relative XPath query.
    """
    matches = elem.xpath('.//div[@class="prem_ical"]')
    if matches is None or len(matches) == 0:
        return

    ical = matches[0]
    parsed = get_date(ical.get('data-ical-date').strip())

    entry = {'region': ical.get('data-ical-type')}
    # Merged after 'region' is set, so keys from the parsed date win.
    entry.update(parsed)
    self.premieres.append(entry)

    if entry['region'] == 'world':
        self.world_premiere = parsed['date']