Beispiel #1
0
    def parse(self, instance, content):
        """Fill ``instance`` with person data scraped from ``content``.

        ``content`` is the HTML text of a person page. It has to be a text
        (unicode) string, because the table labels are compared against a
        non-ASCII text literal.

        Attributes set on ``instance`` when the matching markup is found:
        ``id`` (from the canonical link), ``name``, ``name_original`` and
        ``year_birth``. If ``instance.id`` is truthy, the ``'info'`` URL of
        the instance is downloaded and stored in ``information``. Finally
        ``instance.set_source('main_page')`` is called.
        """
        id_match = re.search(
            r"<link rel=\"canonical\" href=\"http://www.kinopoisk.ru/name/(\d+)/\" />",
            content)
        if id_match:
            instance.id = self.prepare_int(id_match.group(1))

        name_match = re.search(
            r'<h1 class="moviename-big" itemprop="name">(.+?)</h1>', content)
        if name_match:
            instance.name = self.prepare_str(name_match.group(1))

        original_match = re.search(
            r'<span itemprop="alternativeHeadline">([\w\s]+)\s+</span>',
            content)
        if original_match:
            instance.name_original = self.prepare_str(original_match.group(1))

        info_rows = re.findall(
            r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>',
            content)
        for label, value in info_rows:
            # Compare text with text: encoding the label to bytes and
            # comparing it with a native str literal never matches on
            # Python 3. The label means "date of birth".
            if label == u'дата рождения':
                year_match = re.search(
                    r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>',
                    value)
                if year_match:
                    instance.year_birth = self.prepare_int(year_match.group(1))

        if instance.id:
            response = get_request(instance.get_url('info'))
            if response.content:
                # The response body is decoded as windows-1251, dropping
                # undecodable bytes, and the "trivia" class attribute is
                # stripped from the markup.
                instance.information = response.content.decode(
                    'windows-1251', 'ignore').replace(' class="trivia"', '')

        instance.set_source('main_page')
Beispiel #2
0
    def parse(self, instance, content):
        """Populate ``instance`` from the HTML text of a person page.

        Looks for the canonical link (``id``), the big heading (``name``),
        the grey sub-heading (``name_original``) and, inside the section
        delimited by the actor-info HTML comments, the "date of birth" row
        (``year_birth``). When ``instance.id`` is truthy the ``'info'`` URL
        is fetched and stored in ``information``. The method always ends by
        calling ``instance.set_source('main_page')``.
        """
        id_match = re.search(
            r"<link rel=\"canonical\" href=\"http://www.kinopoisk.ru/name/(\d+)/\" />",
            content)
        if id_match:
            instance.id = self.prepare_int(id_match.group(1))

        heading = re.search(
            r'<h1 style="padding:0px;margin:0px" class="moviename-big">(.+?)</h1>',
            content)
        if heading:
            instance.name = self.prepare_str(heading.group(1))

        sub_heading = re.search(
            r'<span style="font-size:13px;color:#666">(.+?)</span>', content)
        if sub_heading:
            instance.name_original = self.prepare_str(sub_heading.group(1))

        # str.find returns -1 for a missing marker; the value is used in the
        # slice as-is.
        section_start = content.find(u'<!-- инфа об актере -->')
        section_stop = content.find(u'<!-- /инфа об актере -->')
        section = content[section_start:section_stop]

        row_pattern = r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>'
        # The href carries the percent-encoded form of
        # m_act[birthday][year].
        year_pattern = r'<a href="/level/10/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>'
        for label, cell in re.findall(row_pattern, section):
            if label != u'дата рождения':
                continue
            year = re.search(year_pattern, cell)
            if year:
                instance.year_birth = self.prepare_int(year.group(1))

        if instance.id:
            response = get_request(instance.get_url('info'))
            if response.content:
                decoded = response.content.decode('windows-1251', 'ignore')
                instance.information = decoded.replace(' class="trivia"', '')

        instance.set_source('main_page')
Beispiel #3
0
    def all(self):
        """Return one parsed object per ``div.premier_item`` on the page.

        The page at the URL/params given by ``get_url_with_params`` is
        fetched, decoded as windows-1251 (undecodable bytes are dropped) and
        parsed with BeautifulSoup. Every matching ``div`` is handed to a new
        ``kinopoisk_object`` via ``parse('premier_link', div)``.
        """
        from BeautifulSoup import BeautifulSoup

        url, params = self.get_url_with_params()
        page = get_request(url, params=params)
        soup = BeautifulSoup(page.content.decode('windows-1251', 'ignore'))

        def build(node):
            # Create a fresh object and let it parse its own markup node.
            item = self.kinopoisk_object()
            item.parse('premier_link', node)
            return item

        return [build(node)
                for node in soup.findAll('div', {'class': 'premier_item'})]
Beispiel #4
0
    def all(self):
        """Fetch the listing page and return the parsed premiere objects.

        Uses ``get_url_with_params`` for the request target, decodes the
        response body as windows-1251 (ignoring undecodable bytes), and
        creates one ``kinopoisk_object`` for each ``div`` with class
        ``premier_item``, parsing it with the ``'premier_link'`` source.
        """
        from BeautifulSoup import BeautifulSoup

        url, params = self.get_url_with_params()
        raw = get_request(url, params=params).content
        soup = BeautifulSoup(raw.decode('windows-1251', 'ignore'))

        results = []
        for item_node in soup.findAll('div', {'class': 'premier_item'}):
            parsed = self.kinopoisk_object()
            parsed.parse('premier_link', item_node)
            results.append(parsed)
        return results
Beispiel #5
0
    def parse(self, instance, content):
        """Extract person fields from page HTML and store them on ``instance``.

        ``content`` must be a text (unicode) string: table labels are
        compared against a non-ASCII text literal.

        Sets ``id``, ``name``, ``name_original`` and ``year_birth`` on
        ``instance`` for every piece of markup that is found. When
        ``instance.id`` is truthy, the ``'info'`` URL is requested and its
        decoded body is stored in ``information``. Always finishes with
        ``instance.set_source('main_page')``.
        """

        def first_group(pattern, text):
            # First captured group of the first match, or None if no match.
            found = re.search(pattern, text)
            return found.group(1) if found else None

        raw_id = first_group(
            r"<link rel=\"canonical\" href=\"http://www.kinopoisk.ru/name/(\d+)/\" />",
            content)
        if raw_id is not None:
            instance.id = self.prepare_int(raw_id)

        raw_name = first_group(
            r'<h1 class="moviename-big" itemprop="name">(.+?)</h1>', content)
        if raw_name is not None:
            instance.name = self.prepare_str(raw_name)

        raw_original = first_group(
            r'<span itemprop="alternativeHeadline">([\w\s]+)\s+</span>',
            content)
        if raw_original is not None:
            instance.name_original = self.prepare_str(raw_original)

        rows = re.findall(
            r'<tr\s*>\s*<td class="type">(.+?)</td>\s*<td[^>]*>(.+?)</td>\s*</tr>',
            content)
        for label, cell in rows:
            # Text-to-text comparison ("date of birth"). Comparing the
            # UTF-8 encoded label with a native str literal is always False
            # on Python 3, which silently skipped the birth year.
            if label != u'дата рождения':
                continue
            raw_year = first_group(
                r'<a href="/lists/m_act%5Bbirthday%5D%5Byear%5D/\d{4}/">(\d{4})</a>',
                cell)
            if raw_year is not None:
                instance.year_birth = self.prepare_int(raw_year)

        if instance.id:
            response = get_request(instance.get_url('info'))
            if response.content:
                # Decode as windows-1251, dropping undecodable bytes, and
                # remove the "trivia" class attribute from the markup.
                instance.information = response.content.decode(
                    'windows-1251', 'ignore').replace(' class="trivia"', '')

        instance.set_source('main_page')