class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        # Company name, read from the text of the title span inside <h1>.
        Extractor(label='name',
                  # note the extra trailing space in the class value
                  path="//h1/span[@class='display-title ']",
                  attrs=Attribute(key='name',
                                  path="./text()",
                                  postprocess=lambda x:
                                  analyze_company_name(x, stripNotes=True))),

        # One group per "<b><a name=...>" heading; the lower-cased heading
        # text is the group key and every <li> of the following <ol> is
        # turned into a movie.
        Extractor(label='filmography',
                  group="//b/a[@name]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../following-sibling::ol[1]/li",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path={
                                      'link': "./a[1]/@href",
                                      'title': "./a[1]/text()",
                                      'year': "./text()[1]"
                                  },
                                  # Missing fields fall back to an empty
                                  # string, so a <li> without title or year
                                  # text neither raises nor yields the
                                  # literal text "None" in the title.
                                  postprocess=lambda x: build_movie(
                                      u'%s %s' % (
                                          x.get('title') or u'',
                                          (x.get('year') or u'').strip()),
                                      movieID=analyze_imdbid(
                                          x.get('link') or u''),
                                      _parsingCompany=True))),
    ]

    preprocessors = [
        # Insert a closing </p> in front of every section heading.
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
    ]

    def postprocess_data(self, data):
        """Rename the section keys of the parsed data, in place.

        In every key, 'company' becomes 'companies', 'other' becomes
        'miscellaneous' and 'distributor' becomes 'distributors'.
        Returns the same (modified) dictionary.
        """
        # Iterate over a snapshot of the keys: entries are added and removed
        # inside the loop, which must not be done while walking the live
        # key view of the dictionary.
        for key in list(data):
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data.pop(key)
        return data
 def _init(self):
     """Install the extra preprocessor and the title/link extractors.

     The extractors rely on the '_titleAttrPath', '_titleFunct' and
     '_linkPath' attributes of the instance.
     """
     # Build a new list instead of using "+=": extending in place would
     # also modify the list reachable through the class (if 'preprocessors'
     # is a class-level attribute), making it grow on every instantiation.
     self.preprocessors = self.preprocessors + [
         ('<span class="tv-extra">TV mini-series</span>',
          '<span class="tv-extra">(mini)</span>')]
     self.extractors = [
         Extractor(label='title',
                   path="//h1",
                   attrs=Attribute(key='title',
                                   path=self._titleAttrPath,
                                   postprocess=self._titleFunct)),
         Extractor(label='link',
                   path=self._linkPath,
                   attrs=Attribute(
                       key='link',
                       path="./@href",
                       # Strip the pro.imdb.com prefix before the href is
                       # handed to analyze_imdbid.
                       postprocess=lambda x: analyze_imdbid(
                           (x or u'').replace('http://pro.imdb.com', '')))),
     ]
class DOMHTMLPersonSalesParser(DOMParserBase):
    """Parser for the "merchandising links" page of a given person.

    The page must be given as a string, as served by akas.imdb.com.
    The parsed result is a dictionary with a single
    'merchandising links' key, or an empty dictionary when nothing
    was extracted.

    Example:
        sparser = DOMHTMLPersonSalesParser()
        result = sparser.parse(sales_html_string)
    """
    extractors = [
        # Grouped by the text of every 'merch_title' span; each matching
        # row is converted by _parse_merchandising_link.
        Extractor(
            label='merchandising links',
            group="//span[@class='merch_title']",
            group_key=".//text()",
            path="./following-sibling::table[1]//"
                 "td[@class='w_rowtable_colshop']//tr[1]",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'link': "./td[2]/a[1]/@href",
                    'text': "./td[1]/img[1]/@alt",
                    'cover': "./ancestor::td[1]/../td[1]/a[1]/img[1]/@src",
                },
                postprocess=lambda link_info:
                _parse_merchandising_link(link_info))),
    ]

    # Rewrite self-closed named anchors as an explicit open/close pair.
    preprocessors = [
        (re.compile('(<a name="[^"]+" )/>', re.I), r'\1></a>'),
    ]

    def postprocess_data(self, data):
        """Wrap non-empty data under the 'merchandising links' key."""
        if not data:
            return {}
        return {'merchandising links': data}
class DOMHTMLOtherWorksParser(DOMParserBase):
    """Parser for the "other works" and "agent" pages of a given person.

    The page must be given as a string, as served by akas.imdb.com;
    the parsed result is a dictionary with a key for every relevant
    section.

    Example:
        owparser = DOMHTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    _defGetRefs = True
    kind = 'other works'

    # XXX: looks like the 'agent' page is no more public.
    extractors = [
        Extractor(
            label='other works',
            path="//h5[text()='Other works']/following-sibling::div[1]",
            # NOTE(review): the key is the literal string 'self.kind';
            # presumably the base parser resolves it to the 'kind'
            # attribute -- confirm before changing it.
            attrs=Attribute(
                key='self.kind',
                path=".//text()",
                # Entries are separated by blank lines (see the last
                # preprocessor below).
                postprocess=lambda text: text.strip().split('\n\n'))),
    ]

    preprocessors = [
        # Close the running <div> before each <h5> heading and open a
        # fresh one right after it.
        (re.compile('(<h5>[^<]+</h5>)', re.I),
         r'</div>\1<div class="_imdbpy">'),
        # Drop one closing </div> after a table/div pair.
        (re.compile('(</table>\n</div>' r'\s+)</div>', re.I), r'\1'),
        (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
        # Double line breaks become blank lines.
        (re.compile('<br/><br/>', re.I), r'\n\n'),
    ]
# Example #5
# 0
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons.

    Every result is turned into a (personID, name data, akas) tuple.
    """
    _BaseParser = DOMBasicPersonParser
    _notDirectHitTitle = '<title>imdb name'
    _titleBuilder = lambda self, name: build_name(name, canonical=True)
    _linkPrefix = '/name/nm'

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'index': "./text()[1]",
                'akas': ".//div[@class='_imdbpyAKA']/text()"
            },
            # The name and the text that follows the link are joined
            # before being analyzed; missing pieces count as empty strings.
            postprocess=lambda hit: (
                analyze_imdbid(hit.get('link') or u''),
                analyze_name(
                    (hit.get('name') or u'') + (hit.get('index') or u''),
                    canonical=1),
                hit.get('akas'),
            )),
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
                  attrs=_attrs),
    ]

    def preprocess_string(self, html_string):
        """Mark up the AKAs of a results page, then defer to the
        movie search parser's preprocessing."""
        page_head = html_string[:1024].lower()
        if self._notDirectHitTitle in page_head:
            # Wrap every AKA in a dedicated div, terminated by '::'.
            html_string = _reAKASp.sub(
                r'\1<div class="_imdbpyAKA">\2::</div>\3', html_string)
        return DOMHTMLSearchMovieParser.preprocess_string(self, html_string)
class DOMHTMLSearchMovieKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for movies with the given
    keyword.

    Every result is turned into a (movieID, title data) tuple.
    """

    _notDirectHitTitle = '<title>best'

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'info': "./a[1]//text()",
                'ynote': "./span[@class='desc']/text()",
                'outline': "./span[@class='outline']//text()"
            },
            # Missing fields are passed on as empty strings.
            postprocess=lambda hit: (
                analyze_imdbid(hit.get('link') or u''),
                custom_analyze_title4kwd(hit.get('info') or u'',
                                         hit.get('ynote') or u'',
                                         hit.get('outline') or u''),
            )),
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs),
    ]
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons.

    Every result is turned into a (personID, name data) tuple.
    """
    _BaseParser = DOMBasicPersonParser
    _notDirectHitTitle = '<title>imdb name'
    _titleBuilder = lambda self, x: build_name(x, canonical=True)
    _linkPrefix = '/name/nm'

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'index': "./text()[1]"
            },
            # Both the name and the trailing index text may be missing;
            # fall back to empty strings so the concatenation cannot
            # raise a TypeError on None.
            postprocess=lambda x: (
                analyze_imdbid(x.get('link') or u''),
                analyze_name((x.get('name') or u'') +
                             (x.get('index') or u''), canonical=1)))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
                  attrs=_attrs)
    ]
class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.

    The page must be given as a string, as served by akas.imdb.com.
    The parsed result is a dictionary with a single 'quotes' key, or
    an empty dictionary when nothing was extracted.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        # Grouped by the link text of every <h5>; the text of the <div>
        # that follows it is split into quotes on the '||' marker.
        Extractor(
            label='introduction',
            group="//h5",
            group_key="./a/text()",
            path="./following-sibling::div[1]",
            attrs=Attribute(
                key=None,
                path=".//text()",
                # Collapse the extra spaces after a colon, then split.
                postprocess=lambda text: text.strip()
                .replace(':   ', ': ')
                .replace(':  ', ': ')
                .split('||'))),
    ]

    preprocessors = [
        # Open a <div> right after each heading...
        (re.compile('(</h5>)', re.I), r'\1<div>'),
        # ...mark double line breaks as quote separators...
        (re.compile(r'\s*<br/><br/>\s*', re.I), r'||'),
        # ...close the <div> where a separator precedes an <hr/>...
        (re.compile(r'\|\|\s*(<hr/>)', re.I), r'</div>\1'),
        # ...and mark the remaining line breaks with '::'.
        (re.compile(r'\s*<br/>\s*', re.I), r'::'),
    ]

    def postprocess_data(self, data):
        """Wrap non-empty data under the 'quotes' key."""
        if not data:
            return {}
        return {'quotes': data}
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """Search-results parser configured for companies.

    Every result is turned into a (companyID, company name data) tuple.
    """
    _BaseParser = DOMBasicCompanyParser
    _notDirectHitTitle = '<title>find - imdb'
    _titleBuilder = lambda self, x: build_company_name(x)
    _linkPrefix = '/company/co'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'name': "./a[1]/text()",
                      'notes': "./text()[1]"
                  },
                  # Any of the fields may be missing: fall back to empty
                  # strings, as the other search parsers do, so that the
                  # concatenation cannot raise a TypeError on None.
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      analyze_company_name((x.get('name') or u'') +
                                           (x.get('notes') or u''),
                                           stripNotes=True)))
    ]

    extractors = [
        Extractor(
            label='search',
            path="//td[@class='result_text']"
                 "/a[starts-with(@href, '/company/co')]/..",
            attrs=_attrs)
    ]
class DOMHTMLCharacterBioParser(DOMParserBase):
    """Parser for the "biography" page of a given character.

    The page must be given as a string, as served by akas.imdb.com;
    the parsed result is a dictionary with a key for every relevant
    section.

    Example:
        bparser = DOMHTMLCharacterBioParser()
        result = bparser.parse(character_biography_html_string)
    """
    _defGetRefs = True

    extractors = [
        # Text of the div tagged '_intro' by the preprocessors.
        Extractor(
            label='introduction',
            path="//div[@id='_intro']",
            attrs=Attribute(
                key='introduction',
                path=".//text()",
                postprocess=lambda text: text.strip())),

        # Every '_biography' span becomes a "heading::text" string, the
        # heading being the closest preceding <h4>.
        Extractor(
            label='biography',
            path="//span[@class='_biography']",
            attrs=Attribute(
                key='biography',
                multi=True,
                path={
                    'info': "./preceding-sibling::h4[1]//text()",
                    'text': ".//text()",
                },
                # Newlines are flattened to spaces and '||' markers are
                # turned into paragraph breaks.
                postprocess=lambda section: u'%s::%s' % (
                    section.get('info').strip(),
                    section.get('text')
                    .replace('\n', ' ')
                    .replace('||', '\n\n')
                    .strip()))),
    ]

    preprocessors = [
        (re.compile('(<div id="swiki.2.3.1">)', re.I),
         r'\1<div id="_intro">'),
        (re.compile(r'(<a name="history">)\s*(<table .*?</table>)',
                    re.I | re.DOTALL),
         r'</div>\2\1</a>'),
        (re.compile('(<a name="[^"]+">)(<h4>)', re.I), r'</span>\1</a>\2'),
        (re.compile('(</h4>)</a>', re.I), r'\1<span class="_biography">'),
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile(r'\|\|' '\n', re.I), r'</span>'),
    ]
class DOMHTMLSeriesParser(DOMParserBase):
    """Parser for the "by TV series" page of a given person.

    The page must be given as a string, as served by akas.imdb.com.
    The parsed result is a dictionary with a single 'episodes' key
    mapping every series to the list of its episodes, or an empty
    dictionary when nothing was extracted.

    Example:
        sparser = DOMHTMLSeriesParser()
        result = sparser.parse(filmoseries_html_string)
    """
    _containsObjects = True

    extractors = [
        # Grouped by the first link of every 'filmo' heading span; each
        # episode link of the following <ol> goes through _build_episode.
        Extractor(
            label='series',
            group="//div[@class='filmo']/span[1]",
            group_key="./a[1]",
            path="./following-sibling::ol[1]/li/a[1]",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'link': "./@href",
                    'title': "./text()",
                    'info': "./following-sibling::text()",
                    'role': "./following-sibling::i[1]/text()",
                    'roleA': "./following-sibling::a[1]/text()",
                    'roleAID': "./following-sibling::a[1]/@href"
                },
                postprocess=lambda ep: _build_episode(
                    ep.get('link'),
                    ep.get('title'),
                    (ep.get('info') or u'').strip(),
                    ep.get('role'),
                    ep.get('roleA'),
                    ep.get('roleAID')))),
    ]

    def postprocess_data(self, data):
        """Turn the group keys into Movie objects and attach each of
        them to its episodes as 'episode of'."""
        if not data:
            return {}
        episodes_by_series = {}
        for series_markup, episodes in data.items():
            # The group key is parsed again to read the link and the
            # title of the series.
            dom = self.get_dom(series_markup)
            href = self.xpath(dom, "//a/@href")[0]
            link_text = self.xpath(dom, "//a/text()")[0]
            series = Movie(movieID=analyze_imdbid(href),
                           # first and last characters are dropped
                           data=analyze_title(link_text[1:-1]),
                           accessSystem=self._as,
                           modFunct=self._modFunct)
            collected = episodes_by_series[series] = []
            for episode in episodes:
                # XXX: should we create a copy of 'series', to avoid
                #      circular references?
                episode['episode of'] = series
                collected.append(episode)
        return {'episodes': episodes_by_series}
# Example #12
# 0
 def _init(self):
     """Build the single extractor that reads one entry per table row
     under the 'main' div.

     The extractor label comes from 'self.label' and the key of the
     first column from 'self.ranktext'.
     """
     section_label = self.label
     column_paths = {
         self.ranktext: "./td[1]//text()",
         'rating': "./td[2]//text()",
         'title': "./td[3]//text()",
         'movieID': "./td[3]//a/@href",
         'votes': "./td[4]//text()"
     }
     row_attribute = Attribute(key=None, multi=True, path=column_paths)
     self.extractors = [
         Extractor(label=section_label,
                   path="//div[@id='main']//table//tr",
                   attrs=row_attribute),
     ]
class DOMHTMLSearchKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for keywords similar to
    the one given.

    Every result is the text of a keyword link.
    """

    _BaseParser = DOMBasicKeywordParser
    _notDirectHitTitle = '<title>imdb keyword'
    _titleBuilder = lambda self, keyword: keyword
    _linkPrefix = '/keyword/'

    _attrs = [
        Attribute(key='data', multi=True, path="./a[1]/text()"),
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/keyword/')]/..",
                  attrs=_attrs),
    ]
# Example #14
# 0
class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.

    The page must be given as a string, as served by www.imdb.com.
    The parsed result is a dictionary with a single 'quotes' key
    mapping every movie to its list of quotes, or an empty dictionary
    when nothing was extracted.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        # Grouped by the link text of every <h5>; each group yields a
        # (movieID, list of quotes) tuple.
        Extractor(
            label='charquotes',
            group="//h5",
            group_key="./a/text()",
            path="./following-sibling::div[1]",
            attrs=Attribute(
                key=None,
                path={
                    'txt': ".//text()",
                    'movieID': ".//a[1]/@href"
                },
                # Collapse the extra spaces after a colon, then split
                # the text into quotes on the '||' marker.
                postprocess=lambda block: (
                    analyze_imdbid(block['movieID']),
                    block['txt'].strip()
                    .replace(':   ', ': ')
                    .replace(':  ', ': ')
                    .split('||'),
                ))),
    ]

    preprocessors = [
        (re.compile('(</h5>)', re.I), r'\1<div>'),
        (re.compile(r'\s*<br/><br/>\s*', re.I), r'||'),
        (re.compile(r'\|\|\s*(<hr/>)', re.I), r'</div>\1'),
        (re.compile(r'\s*<br/>\s*', re.I), r'::'),
    ]

    def postprocess_data(self, data):
        """Key the quotes by Movie object (or by plain title when no
        movieID is available) and split every quote on '::'."""
        if not data:
            return {}
        quotes_by_movie = {}
        for title, (movieID, quotes) in data.items():
            if movieID is not None:
                movie = Movie(title=title,
                              movieID=movieID,
                              accessSystem=self._as,
                              modFunct=self._modFunct)
            else:
                movie = title
            quotes_by_movie[movie] = [quote.split('::') for quote in quotes]
        return {'quotes': quotes_by_movie}
# Example #15
# 0
class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
    """Search-results parser configured for characters.

    Every result is turned into a (characterID, {'name': ...}) tuple.
    """
    _BaseParser = DOMBasicCharacterParser
    _notDirectHitTitle = '<title>imdb search'
    _titleBuilder = lambda self, name: build_name(name, canonical=False)
    _linkPrefix = '/character/ch'

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()"
            },
            postprocess=lambda hit: (
                analyze_imdbid(hit.get('link') or u''),
                {'name': hit.get('name')},
            )),
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/character/ch')]/..",
                  attrs=_attrs),
    ]
class DOMHTMLOtherWorksParser(DOMParserBase):
    """Parser for the "other works" and "agent" pages of a given person.

    The page must be given as a string, as served by akas.imdb.com;
    the parsed result is a dictionary with a key for every relevant
    section.

    Example:
        owparser = DOMHTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    _defGetRefs = True
    kind = 'other works'

    extractors = [
        # One stripped text entry per 'ipl-zebra-list__item' list item.
        Extractor(
            label='other works',
            path="//li[@class='ipl-zebra-list__item']",
            attrs=Attribute(
                key='other works',
                path=".//text()",
                multi=True,
                postprocess=lambda text: text.strip())),
    ]
class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'
    _containsObjects = True

    extractors = [
        # Grouped by the lower-cased text of the link that follows every
        # named anchor; each movie link of the following <ol> is turned
        # into a movie.
        Extractor(label='genres',
                  group="//b/a[@name]/following-sibling::a[1]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../../following-sibling::ol[1]/li//a[1]",
                  attrs=Attribute(
                      key=None,
                      multi=True,
                      path={
                          'link': "./@href",
                          'title': "./text()",
                          'info': "./following-sibling::text()"
                      },
                      # Only the part of the trailing text before the
                      # first '[' is appended to the title.  Missing
                      # fields fall back to empty strings, so a link with
                      # no text around it cannot raise on None.
                      postprocess=lambda x: build_movie(
                          (x.get('title') or u'') +
                          (x.get('info') or u'').split('[')[0],
                          analyze_imdbid(x.get('link') or u''))))
    ]

    def postprocess_data(self, data):
        """Wrap non-empty data under the key named by 'self.kind'."""
        if not data:
            return {}
        return {self.kind: data}
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies.

    Subclasses reuse the same machinery for other kinds of searches by
    overriding '_BaseParser', '_notDirectHitTitle', '_titleBuilder',
    '_linkPrefix' and the extractors.
    """

    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>find - imdb</title>'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      'akas': "./i//text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      custom_analyze_title(x.get('info') or u''),
                      x.get('akas')))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[@class='result_text']",
                  attrs=_attrs)
    ]

    def _init(self):
        """Initialize the URL of the page being parsed."""
        self.url = u''

    def _reset(self):
        """Forget the URL of the previously parsed page."""
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to parse for the given page.

        A page whose first 10240 characters contain '_notDirectHitTitle'
        is a regular list of results and is returned (almost) untouched.
        Any other page is treated as a direct hit: it is parsed with
        '_BaseParser' and a minimal one-result page is built from the
        outcome; an empty string is returned when that fails.
        """
        if self._notDirectHitTitle in html_string[:10240].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                # XXX (HTU): does this still apply?
                html_string = html_string.replace('(TV mini-series)', '(mini)')
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        return '<td class="result_text"><a href="%s">%s</a></td>' % (link,
                                                                     title)

    def postprocess_data(self, data):
        """Normalize the list of results stored under data['data'].

        The list is created if missing and truncated to 'self.results'
        entries when that attribute is set.  When results are
        (ID, info, akas) tuples, entries with an empty ID or info are
        dropped, the AKAs (if any) are stored under info['akas'] and
        every entry is reduced to an (ID, info) tuple.
        """
        # "in" instead of dict.has_key(), which only exists on Python 2.
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        hits = data['data']
        # Check the type first, so len() is only called on tuples.
        if hits and isinstance(hits[0], tuple) and len(hits[0]) == 3:
            hits = [x for x in hits if x[0] and x[1]]
            data['data'] = hits
            for idx, datum in enumerate(hits):
                if not isinstance(datum, tuple):
                    continue
                # No further check on datum[0] and datum[1] is needed:
                # entries with an empty ID or info were filtered out above.
                if datum[2] is not None:
                    if self._linkPrefix == '/title/tt':
                        # XXX (HTU): couldn't find a result with multiple akas
                        akas = [datum[2][1:-1]]  # remove the quotes
                    else:
                        # Other kinds of results carry '::'-separated
                        # AKAs.  Without this branch 'akas' was unbound
                        # (or left over from a previous result).
                        akas = [a for a in datum[2].split('::') if a]
                    datum[1]['akas'] = akas
                hits[idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        """Return the data unchanged: no references are added."""
        return data
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.

    The page must be given as a string, as served by akas.imdb.com;
    the parsed result is a dictionary with a key for every relevant
    section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    # Birth date (built from the day and year links) and birth notes.
    _birth_attrs = [
        Attribute(
            key='birth date',
            path={
                'day': "./a[starts-with(@href, '/OnThisDay?')]/text()",
                'year': "./a[starts-with(@href, '/BornInYear?')]/text()"
            },
            postprocess=lambda parts: build_date(parts)),
        Attribute(
            key='birth notes',
            path="./a[starts-with(@href, '/BornWhere?')]/text()"),
    ]
    # Death date (built from the day and year links) and death notes.
    _death_attrs = [
        Attribute(
            key='death date',
            path={
                'day': "./a[starts-with(@href, '/OnThisDay?')]/text()",
                'year': "./a[starts-with(@href, '/DiedInYear?')]/text()"
            },
            postprocess=lambda parts: build_date(parts)),
        Attribute(
            key='death notes',
            path="./text()",
            # TODO: check if this slicing is always correct
            postprocess=lambda text: text.strip()[2:]),
    ]
    # One movie per filmography entry.
    _film_attrs = [
        Attribute(
            key=None,
            multi=True,
            path={
                'link': "./a[1]/@href",
                'title': ".//text()",
                'status': "./i/a//text()",
                'roleID': "./div[@class='_imdbpyrole']/@roleid"
            },
            postprocess=lambda entry: build_movie(
                entry.get('title') or u'',
                movieID=analyze_imdbid(entry.get('link') or u''),
                roleID=(entry.get('roleID') or u'').split('/'),
                status=entry.get('status') or None)),
    ]

    extractors = [
        Extractor(
            label='page title',
            path="//title",
            attrs=Attribute(
                key='name',
                path="./text()",
                postprocess=lambda text: analyze_name(text, canonical=1))),
        Extractor(
            label='birth info',
            path="//div[h5='Date of Birth:']",
            attrs=_birth_attrs),
        Extractor(
            label='death info',
            path="//div[h5='Date of Death:']",
            attrs=_death_attrs),
        Extractor(
            label='headshot',
            path="//a[@name='headshot']",
            attrs=Attribute(key='headshot', path="./img/@src")),
        Extractor(
            label='akas',
            path="//div[h5='Alternate Names:']",
            attrs=Attribute(
                key='akas',
                path="./text()",
                postprocess=lambda text: text.strip().split(' | '))),
        # Grouped by the heading of every 'filmo' div: the heading text
        # is lower-cased and its last character is dropped.
        Extractor(
            label='filmography',
            group="//div[@class='filmo'][h5]",
            group_key="./h5/a[@name]/text()",
            group_key_normalize=lambda heading: heading.lower()[:-1],
            path="./ol/li",
            attrs=_film_attrs),
    ]
    preprocessors = [
        # XXX: check that this doesn't cut "status" or other info...
        (re.compile(r'<br>(\.\.\.|    ?).+?</li>', re.I | re.M | re.S),
         '</li>'),
        (_reRoles, _manageRoles),
    ]
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True
    # Parenthesised roman numeral, e.g. "(II)"; used to detect the
    # disambiguation index that follows a person's name.
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    _birth_attrs = [
        Attribute(key='birth date',
                  path='.//time[@itemprop="birthDate"]/@datetime'),
        Attribute(key='birth place',
                  path=".//a[starts-with(@href, "
                       "'/search/name?birth_place=')]/text()"),
    ]
    _death_attrs = [
        Attribute(key='death date',
                  path='.//time[@itemprop="deathDate"]/@datetime'),
        Attribute(key='death place',
                  path=".//a[starts-with(@href, "
                       "'/search/name?death_place=')]/text()"),
    ]
    # One entry per filmography row; the collected pieces are handed to
    # build_movie().
    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./b/a[1]/@href",
                      'title': "./b/a[1]/text()",
                      'notes': "./b/following-sibling::text()",
                      'year': "./span[@class='year_column']/text()",
                      'status': "./a[@class='in_production']/text()",
                      'rolesNoChar': './/br/following-sibling::text()',
                      'chrRoles': "./a[@imdbpyname]/@imdbpyname",
                      'roleID': "./a[starts-with(@href, '/character/')]/@href"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      year=x.get('year'),
                      movieID=analyze_imdbid(x.get('link') or u''),
                      rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
                      chrRoles=(x.get('chrRoles') or u'').strip(),
                      additionalNotes=x.get('notes'),
                      roleID=(x.get('roleID') or u''),
                      status=x.get('status') or None))
    ]

    extractors = [
        Extractor(label='name',
                  path="//h1[@class='header']",
                  attrs=Attribute(
                      key='name',
                      path=".//text()",
                      postprocess=lambda x: analyze_name(x, canonical=1))),
        # Temporary key, consumed (and removed) by postprocess_data().
        Extractor(label='name_index',
                  path="//h1[@class='header']/span[1]",
                  attrs=Attribute(key='name_index',
                                  path="./text()")),

        Extractor(label='birth info',
                  path="//div[h4='Born:']",
                  attrs=_birth_attrs),

        Extractor(label='death info',
                  path="//div[h4='Died:']",
                  attrs=_death_attrs),

        Extractor(label='headshot',
                  path="//td[@id='img_primary']/div[@class='image']/a",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),

        Extractor(label='akas',
                  path="//div[h4='Alternate Names:']",
                  attrs=Attribute(
                      key='akas',
                      path="./text()",
                      postprocess=lambda x: x.strip().split('  '))),

        Extractor(label='filmography',
                  group="//div[starts-with(@id, 'filmo-head-')]",
                  group_key="./a[@name]/text()",
                  group_key_normalize=lambda x: x.lower().replace(': ', ' '),
                  path="./following-sibling::div[1]"
                       "/div[starts-with(@class, 'filmo-row')]",
                  attrs=_film_attrs),

        Extractor(label='indevelopment',
                  path="//div[starts-with(@class,'devitem')]",
                  attrs=Attribute(
                      key='in development',
                      multi=True,
                      path={
                          'link': './a/@href',
                          'title': './a/text()'
                      },
                      # 'roleID' and 'status' are not collected by the
                      # paths above, so the fallbacks below always apply.
                      postprocess=lambda x: build_movie(
                          x.get('title') or u'',
                          movieID=analyze_imdbid(x.get('link') or u''),
                          roleID=(x.get('roleID') or u'').split('/'),
                          status=x.get('status') or None)))
    ]

    preprocessors = [
        ('<div class="clear"/> </div>', ''), ('<br/>', '<br />'),
        # Tag every character link with an "imdbpyname" attribute holding
        # the link text followed by '@@' (read back by 'chrRoles' above).
        (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
         r'\1 imdbpyname="\2@@">\2</a>')
    ]

    def postprocess_data(self, data):
        """Clean up the parsed dictionary and return it.

        * Drop empty 'birth date' / 'death date' values.
        * Turn a non-blank 'name_index' such as "(II)" into
          data['imdbIndex'] = "II" and remove the temporary key.
        * For backwards compatibility, merge every key starting with
          'actor ', 'actress ' or 'self ' into the plain 'actor' /
          'actress' / 'self' lists, and rename 'birth place' and
          'death place' to 'birth notes' and 'death notes'.

        The dictionary is modified in place.
        """
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        name_index = (data.get('name_index') or '').strip()
        if name_index:
            if self._name_imdb_index.match(name_index):
                # Strip the surrounding parentheses.
                data['imdbIndex'] = name_index[1:-1]
            del data['name_index']
        # XXX: the code below is for backwards compatibility
        # probably could be removed
        merged_sections = ('actor', 'actress', 'self')
        renamed = {'birth place': 'birth notes',
                   'death place': 'death notes'}
        # Iterate over a snapshot of the keys: entries are added and removed
        # inside the loop, which a live Python 3 key view does not allow.
        for key in list(data.keys()):
            for section in merged_sections:
                if key.startswith(section + ' '):
                    # "in" / setdefault instead of dict.has_key(), which no
                    # longer exists in Python 3.
                    data.setdefault(section, []).extend(data[key])
                    del data[key]
                    break
            else:
                if key in renamed:
                    data[renamed[key]] = data.pop(key)
        return data
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parse the "biography" page of a given character.

    The input is the page's HTML as a string (as served by the
    akas.imdb.com server); the output is a dictionary holding one key
    for every relevant section.

    Example:
        bparser = DOMHTMLCharacterMaindetailsParser()
        result = bparser.parse(character_biography_html_string)
    """
    _containsObjects = True

    # A single multi-valued attribute: every filmography <li> is turned
    # into a movie via build_movie(..., _parsingCharacter=True).
    _film_attrs = [
        Attribute(
            key=None,
            multi=True,
            path={
                'link': "./a[1]/@href",
                'title': ".//text()",
                'status': "./i/a//text()",
                'roleID': "./a/@href"
            },
            postprocess=lambda item: build_movie(
                item.get('title') or u'',
                movieID=analyze_imdbid(item.get('link') or u''),
                roleID=_personIDs.findall(item.get('roleID') or u''),
                status=item.get('status') or None,
                _parsingCharacter=True)),
    ]

    extractors = [
        Extractor(
            label='title',
            path="//title",
            attrs=Attribute(
                key='name',
                path="./text()",
                # Remove the ' (Character)' marker from the page title.
                postprocess=lambda text:
                    text.replace(' (Character)', '').strip())),
        Extractor(
            label='headshot',
            path="//a[@name='headshot']",
            attrs=Attribute(key='headshot', path="./img/@src")),
        Extractor(
            label='akas',
            path="//div[h5='Alternate Names:']",
            attrs=Attribute(
                key='akas',
                path="./text()",
                postprocess=lambda text: text.strip().split(' / '))),
        # Filmography blocks without an <h5> heading.
        Extractor(
            label='filmography',
            path="//div[@class='filmo'][not(h5)]/ol/li",
            attrs=_film_attrs),
        # Filmography blocks with an <h5> heading, grouped by the
        # lower-cased heading text minus its last character.
        Extractor(
            label='filmography sections',
            group="//div[@class='filmo'][h5]",
            group_key="./h5/a/text()",
            group_key_normalize=lambda heading: heading.lower()[:-1],
            path="./ol/li",
            attrs=_film_attrs),
    ]

    preprocessors = [
        # Check that this doesn't cut "status"...
        (re.compile(r'<br>(\.\.\.|   ).+?</li>', re.I | re.M), '</li>'),
    ]
# Example #22
# 0
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies.

    The parsed dictionary has a 'data' key holding a list of
    (movieID, title_info) tuples; when alternate titles are found they are
    stored as a list under title_info['akas'].
    """

    _BaseParser = DOMBasicMovieParser
    # Lower-case marker looked for in the first 1024 characters of the
    # page; when it is missing, the page is treated as a "direct hit".
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'info': ".//text()",
                            #'akas': ".//div[@class='_imdbpyAKA']//text()"
                            'akas': ".//p[@class='find-aka']//text()"
                            },
                        # Yields (movieID, title_info, raw_akas) triples;
                        # postprocess_data() folds raw_akas into title_info.
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            custom_analyze_title(x.get('info') or u''),
                            x.get('akas')
                        ))]
    extractors = [Extractor(label='search',
                        path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                        attrs=_attrs)]

    def _init(self):
        """Set the initial value of the per-parse URL."""
        self.url = u''

    def _reset(self):
        """Clear the per-parse URL."""
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the HTML that will actually be parsed.

        A search-results page is returned almost unchanged (for movies,
        '(TV mini-series)' is shortened and every AKA paragraph gets a
        '::' prefix so the AKAs can be split apart later).  A "direct hit"
        page is parsed with ``_BaseParser`` and replaced by a minimal
        one-row snippet that the 'search' extractor can read; an empty
        string is returned if that fails.
        """
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)', '(mini)')
                html_string = html_string.replace('<p class="find-aka">',
                        '<p class="find-aka">::')
                #html_string = _reAKAStitles.sub(
                #        r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                    title)
        return new_html

    def postprocess_data(self, data):
        """Normalize the result list and attach AKAs to each title.

        Ensures data['data'] exists, truncates it to ``self.results``
        entries when that attribute is set, and converts every
        (movieID, info, raw_akas) triple into a (movieID, info) pair, with
        the '::'-separated AKAs stored as a list in info['akas'].
        """
        # "in" instead of dict.has_key(), which Python 3 removed.
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        entries = data['data']
        # Horrible hack to support AKAs.
        if entries and len(entries[0]) == 3 and \
                isinstance(entries[0], tuple):
            for idx, datum in enumerate(entries):
                if datum[2] is not None:
                    # A list comprehension (not filter()) so that 'akas' is
                    # a real list on Python 3 as well.
                    akas = [a for a in datum[2].split('::') if a]
                    if self._linkPrefix == '/title/tt':
                        akas = [a.replace('" - ', '::').rstrip() for a in akas]
                        akas = [a.replace('aka "', '', 1).lstrip() for a in akas]
                    datum[1]['akas'] = akas
                # In every case the raw AKA string is dropped from the tuple.
                entries[idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        """Return *data* unchanged."""
        return data
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    _birth_attrs = [Attribute(key='birth date',
                        path={
                            'day': "./a[starts-with(@href, "
                                    "'/search/name?birth_monthday=')]/text()",
                            'year': "./a[starts-with(@href, "
                                    "'/search/name?birth_year=')]/text()"
                            },
                        postprocess=build_date),
                    Attribute(key='birth notes',
                        path="./a[starts-with(@href, "
                                "'/search/name?birth_place=')]/text()")]
    _death_attrs = [Attribute(key='death date',
                        path={
                            'day': "./a[starts-with(@href, "
                                    "'/search/name?death_monthday=')]/text()",
                            'year': "./a[starts-with(@href, "
                                    "'/search/name?death_date=')]/text()"
                            },
                        postprocess=build_date),
                    Attribute(key='death notes',
                        path="./text()",
                        # TODO: check if this slicing is always correct
                        postprocess=lambda x: u''.join(x).strip()[2:])]
    extractors = [
            Extractor(label='headshot',
                        path="//a[@name='headshot']",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),
            Extractor(label='birth info',
                        path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
                        attrs=_birth_attrs),
            Extractor(label='death info',
                        path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
                        attrs=_death_attrs),
            # NOTE(review): 'Nickenames' looks like a misspelling of the
            # cell label; if so this extractor never matches.  Left
            # unchanged -- verify against a live page before editing.
            Extractor(label='nick names',
                        path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
                        attrs=Attribute(key='nick names',
                            path="./text()",
                            joiner='|',
                            # Split on the joiner, drop blank pieces and mark
                            # the first ' (' of each name with '::'.
                            postprocess=lambda x: [n.strip().replace(' (',
                                    '::(', 1) for n in x.split('|')
                                    if n.strip()])),
            Extractor(label='birth name',
                        path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
                        attrs=Attribute(key='birth name',
                            path="./text()",
                            postprocess=lambda x: canonicalName(x.strip()))),
            Extractor(label='height',
                path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
                        attrs=Attribute(key='height',
                            path="./text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='mini biography',
                        path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
                        attrs=Attribute(key='mini biography',
                            multi=True,
                            path={
                                'bio': ".//text()",
                                'by': ".//a[@name='ba']//text()"
                                },
                            # "<text>::<author>", with 'Anonymous' when no
                            # author is found.
                            postprocess=lambda x: "%s::%s" % (
                                (x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
                                (x.get('by') or u'').strip() or u'Anonymous'))),
            Extractor(label='spouse',
                        path="//div[h5='Spouse']/table/tr",
                        attrs=Attribute(key='spouse',
                            multi=True,
                            path={
                                'name': "./td[1]//text()",
                                'info': "./td[2]//text()"
                                },
                            # Both cells are guarded against None so a row
                            # with an empty cell cannot raise.
                            postprocess=lambda x: ("%s::%s" % (
                                (x.get('name') or u'').strip(),
                                (x.get('info') or u'').strip())).strip(':'))),
            Extractor(label='trade mark',
                        path="//div[h5='Trade Mark']/p",
                        attrs=Attribute(key='trade mark',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='trivia',
                        path="//div[h5='Trivia']/p",
                        attrs=Attribute(key='trivia',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='quotes',
                        path="//div[h5='Personal Quotes']/p",
                        attrs=Attribute(key='quotes',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='salary',
                        path="//div[h5='Salary']/table/tr",
                        attrs=Attribute(key='salary history',
                            multi=True,
                            path={
                                'title': "./td[1]//text()",
                                'info': "./td[2]/text()",
                                },
                            # Same None guard as for 'spouse'.
                            postprocess=lambda x: "%s::%s" % (
                                (x.get('title') or u'').strip(),
                                (x.get('info') or u'').strip()))),
            Extractor(label='where now',
                        path="//div[h5='Where Are They Now']/p",
                        attrs=Attribute(key='where now',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            ]

    # (pattern, replacement) pairs.  Patterns are raw strings so that
    # regex escapes such as \s and \. are not read as (invalid) Python
    # string escapes; the patterns match exactly what they did before.
    preprocessors = [(re.compile(r'(<h5>)',
                                 re.I), r'</div><div class="_imdbpy">\1'),
                     (re.compile(r'(</table>\n</div>\s+)</div>',
                                 re.I | re.DOTALL), r'\1'),
                     (re.compile(r'(<div id="tn15bot">)'), r'</div>\1'),
                     (re.compile(r'\.<br><br>([^\s])', re.I), r'. \1')]

    def postprocess_data(self, data):
        """Remove empty 'birth date' / 'death date' entries; return data."""
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        return data
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies.

    The parsed dictionary has a 'data' key holding a list of
    (movieID, title_info) tuples.
    """

    _BaseParser = DOMBasicMovieParser
    # Lower-case marker looked for in the first 1024 characters of the
    # page; when it is missing, the page is treated as a "direct hit".
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x, canonical=True)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()"
                  },
                  postprocess=lambda x:
                  (analyze_imdbid(x.get('link') or u''),
                   analyze_title(x.get('info') or u'', canonical=1)))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs)
    ]

    def _init(self):
        """Set the initial value of the per-parse URL."""
        self.url = u''

    def _reset(self):
        """Clear the per-parse URL."""
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the HTML that will actually be parsed.

        A search-results page is returned after light clean-up (for
        movies, '(TV mini-series)' is shortened and ``_reAKAS`` matches
        are replaced with '</td>').  A "direct hit" page is parsed with
        ``_BaseParser`` and replaced by a minimal one-row snippet that the
        'search' extractor can read; an empty string is returned if that
        fails.
        """
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)', '(mini)')
                html_string = _reAKAS.sub('</td>', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                       title)
        return new_html

    def postprocess_data(self, data):
        """Ensure data['data'] exists and cap it at ``self.results`` items.

        The list is truncated in place only when the instance has a
        ``results`` attribute that is not None.
        """
        # setdefault instead of dict.has_key(), which Python 3 removed.
        found = data.setdefault('data', [])
        results = getattr(self, 'results', None)
        if results is not None:
            found[:] = found[:results]
        return data

    def add_refs(self, data):
        """Return *data* unchanged."""
        return data
class DOMHTMLResumeParser(DOMParserBase):
    """Parser for the "resume" page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        resumeparser = DOMHTMLResumeParser()
        result = resumeparser.parse(resume_html_string)
    """
    _defGetRefs = True

    extractors = [
        # Sections made of "title: description" list items; each item
        # becomes a (title, description) tuple.
        Extractor(
            label='info',
            group="//div[@class='section_box']",
            group_key="./h3/text()",
            group_key_normalize=lambda x: x.lower().replace(' ', '_'),
            path="./ul[@class='resume_section_multi_list']//li",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'title': ".//b//text()",
                    'desc': ".//text()",
                },
                # 'desc' is guarded against None so an empty <li> cannot
                # raise.
                postprocess=lambda x:
                (x.get('title'),
                 (x.get('desc') or u'').strip().replace('\n', ' ')))),
        # Lists tagged "_imdbpy" by the preprocessor below; each item is a
        # plain string.
        Extractor(label='other_info',
                  group="//div[@class='section_box']",
                  group_key="./h3/text()",
                  group_key_normalize=lambda x: x.lower().replace(' ', '_'),
                  path="./ul[@class='_imdbpy']//li",
                  attrs=Attribute(
                      key=None,
                      multi=True,
                      path=".//text()",
                      postprocess=lambda x: x.strip().replace('\n', ' '))),
        # Credit tables; each row becomes a three-item list (missing cells
        # are None until postprocess_data() removes them).
        Extractor(
            label='credits',
            group="//div[@class='section_box']",
            group_key="./h3/text()",
            group_key_normalize=lambda x: x.lower().replace(' ', '_'),
            path="./table[@class='credits']//tr",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    '0': ".//td[1]//text()",
                    '1': ".//td[2]//text()",
                    '2': ".//td[3]//text()",
                },
                postprocess=lambda x: [x.get(
                    '0'), x.get('1'), x.get('2')])),
        Extractor(label='mini_info',
                  path="//div[@class='center']",
                  attrs=Attribute(
                      key='mini_info',
                      path=".//text()",
                      postprocess=lambda x: x.strip().replace('\n', ' '))),
        Extractor(label='name',
                  path="//div[@class='center']/h1[@id='preview_user_name']",
                  attrs=Attribute(
                      key='name',
                      path=".//text()",
                      postprocess=lambda x: x.strip().replace('\n', ' '))),
        Extractor(label='resume_bio',
                  path="//div[@id='resume_rendered_html']//p",
                  attrs=Attribute(key='resume_bio',
                                  multi=True,
                                  path=".//text()")),
    ]

    preprocessors = [
        (re.compile('(<ul>)', re.I), r'<ul class="_imdbpy">\1'),
    ]

    def postprocess_data(self, data):
        """Tidy every parsed section and wrap the result.

        * Entries equal to '' are removed.
        * 'resume_bio' is joined into a single stripped string;
          'mini_info' and 'name' are kept as they are.
        * Sections whose rows are three-item lists get their None cells
          removed in place.
        * Sections whose rows are (title, description) pairs become a
          dictionary; for titles containing ':' the title text is cut out
          of the description and rows left with nothing are dropped.
        * Any other section (empty, or made of plain strings) is left
          untouched.

        Returns ``{'resume': data}``.
        """
        # Snapshot of the keys: entries may be deleted inside the loop.
        for key in list(data.keys()):
            value = data[key]
            if value == '':
                del data[key]
                # Nothing left to process for this key.
                continue
            if key in ('mini_info', 'name', 'resume_bio'):
                if key == 'resume_bio':
                    data[key] = "".join(value).strip()
                continue
            # Only sections made of list/tuple rows are restructured; this
            # skips empty sections and lists of plain strings, which must
            # not be mistaken for 2- or 3-item records.
            if not value or not isinstance(value[0], (list, tuple)):
                continue
            row_size = len(value[0])
            if row_size == 3:
                for item in value:
                    item[:] = [x for x in item if x is not None]
                continue

            if row_size == 2:
                section = {}
                for item in value:
                    title, desc = item[0], item[1]
                    if title is None:
                        continue
                    if ':' in title:
                        # Remove the title from the description, then skip
                        # the first remaining character and surrounding
                        # whitespace.
                        remainder = desc.replace(title, '')[1:].strip()
                        if remainder == '':
                            continue
                        section[title.strip().replace(':', '')] = remainder
                    else:
                        section[title] = desc
                data[key] = section

        new_data = {'resume': data}
        return new_data
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Every section is located through the text of the <h5> heading of
    its <div>.  Two-part entries (mini biography, spouse, salary
    history) are rendered as "first::second" strings.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    # Birth: the date is built by build_date() out of the texts of the
    # "OnThisDay" and "BornInYear" links; the notes are the text of the
    # "BornWhere" link.
    _birth_attrs = [
        Attribute(
            key='birth date',
            path={
                'day': "./a[starts-with(@href, '/OnThisDay?')]/text()",
                'year': "./a[starts-with(@href, '/BornInYear?')]/text()",
            },
            postprocess=lambda x: build_date(x)),
        Attribute(
            key='birth notes',
            path="./a[starts-with(@href, '/BornWhere?')]/text()"),
    ]

    # Death: same layout as the birth date (with the "DiedInYear" link);
    # the notes are the plain text of the section, minus its first two
    # characters once stripped.
    _death_attrs = [
        Attribute(
            key='death date',
            path={
                'day': "./a[starts-with(@href, '/OnThisDay?')]/text()",
                'year': "./a[starts-with(@href, '/DiedInYear?')]/text()",
            },
            postprocess=lambda x: build_date(x)),
        Attribute(
            key='death notes',
            path="./text()",
            # TODO: check if this slicing is always correct
            postprocess=lambda x: u''.join(x).strip()[2:]),
    ]

    # NOTE: in the "first::second" post-processors below every field is
    # read as ``(x.get(field) or u'')``, like the original code already
    # did for 'by': calling .strip() directly on a missing field would
    # raise AttributeError (presumably the extractor stores None when a
    # sub-path matches nothing -- confirm against DOMParserBase).
    extractors = [
        Extractor(
            label='birth info',
            path="//div[h5='Date of Birth']",
            attrs=_birth_attrs),
        Extractor(
            label='death info',
            path="//div[h5='Date of Death']",
            attrs=_death_attrs),
        Extractor(
            label='nick names',
            path="//div[h5='Nickname']",
            attrs=Attribute(
                key='nick names',
                path="./text()",
                joiner='|',
                # Split on the joiner again, skip the blank pieces and
                # turn the first " (" of every nickname into "::(".
                postprocess=lambda x: [
                    n.strip().replace(' (', '::(', 1)
                    for n in x.split('|') if n.strip()])),
        Extractor(
            label='birth name',
            path="//div[h5='Birth Name']",
            attrs=Attribute(
                key='birth name',
                path="./text()",
                postprocess=lambda x: canonicalName(x.strip()))),
        Extractor(
            label='height',
            path="//div[h5='Height']",
            attrs=Attribute(
                key='height',
                path="./text()",
                postprocess=lambda x: x.strip())),
        Extractor(
            label='mini biography',
            path="//div[h5='Mini Biography']",
            attrs=Attribute(
                key='mini biography',
                multi=True,
                path={
                    'bio': "./p//text()",
                    'by': "./b/following-sibling::a/text()",
                },
                # "biography::author"; the author falls back to
                # 'Anonymous' when missing or blank.
                postprocess=lambda x: "%s::%s" % (
                    (x.get('bio') or u'').strip(),
                    (x.get('by') or u'').strip() or u'Anonymous'))),
        Extractor(
            label='spouse',
            path="//div[h5='Spouse']/table/tr",
            attrs=Attribute(
                key='spouse',
                multi=True,
                path={
                    'name': "./td[1]//text()",
                    'info': "./td[2]//text()",
                },
                # "name::info", one entry per table row.
                postprocess=lambda x: "%s::%s" % (
                    (x.get('name') or u'').strip(),
                    (x.get('info') or u'').strip()))),
        Extractor(
            label='trade mark',
            path="//div[h5='Trade Mark']/p",
            attrs=Attribute(
                key='trade mark',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip())),
        Extractor(
            label='trivia',
            path="//div[h5='Trivia']/p",
            attrs=Attribute(
                key='trivia',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip())),
        Extractor(
            label='quotes',
            path="//div[h5='Personal Quotes']/p",
            attrs=Attribute(
                key='quotes',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip())),
        Extractor(
            label='salary',
            path="//div[h5='Salary']/table/tr",
            attrs=Attribute(
                key='salary history',
                multi=True,
                path={
                    'title': "./td[1]//text()",
                    'info': "./td[2]/text()",
                },
                # "title::info", one entry per table row.
                postprocess=lambda x: "%s::%s" % (
                    (x.get('title') or u'').strip(),
                    (x.get('info') or u'').strip()))),
        Extractor(
            label='where now',
            path="//div[h5='Where Are They Now']/p",
            attrs=Attribute(
                key='where now',
                multi=True,
                path=".//text()",
                postprocess=lambda x: x.strip())),
    ]

    # List of (compiled pattern, replacement) pairs.
    # NOTE(review): presumably applied to the raw HTML by DOMParserBase
    # before the extractors run -- confirm.
    # Patterns holding regex escapes are raw strings, so that "\s" and
    # "\." are not flagged as invalid string escape sequences.
    preprocessors = [
        # Close the current <div> and open a new one before every <h5>.
        (re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
        # Drop the extra </div> found after "</table>\n</div>" + whitespace.
        (re.compile(r'(</table>\n</div>\s+)</div>', re.I | re.DOTALL),
         r'\1'),
        # Add a closing </div> right before the "tn15bot" <div>.
        (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
        # Turn ".<br><br>" followed by a non-blank character into ". ".
        (re.compile(r'\.<br><br>([^\s])', re.I), r'. \1'),
    ]