Example #1
0
 def _getitem(self, key):
     """Resolve the computed keys that are not stored in self.data.

     Episode-related keys are honoured only when 'episode of' is present
     in the data and title-related keys only when 'title' is present;
     'full-size cover url' is always handled.  Any other key yields None.
     """
     info = self.data
     if 'episode of' in info:
         if key == 'long imdb episode title':
             return build_title(info)
         if key == 'series title':
             return self._getSeriesTitle(info['episode of'])
         if key == 'canonical series title':
             series = self._getSeriesTitle(info['episode of'])
             return canonicalTitle(series)
         if key == 'smart canonical series title':
             series = self._getSeriesTitle(info['episode of'])
             return self.smartCanonicalTitle(series)
         if key == 'episode title':
             return info.get('title', '')
         if key == 'canonical episode title':
             return canonicalTitle(info.get('title', ''))
         if key == 'smart canonical episode title':
             return self.smartCanonicalTitle(info.get('title', ''))
     # Unmatched episode keys fall through to the generic title keys.
     if 'title' in info:
         if key == 'title':
             return info['title']
         if key == 'long imdb title':
             return build_title(info)
         if key == 'canonical title':
             return canonicalTitle(info['title'])
         if key == 'smart canonical title':
             return self.smartCanonicalTitle(info['title'])
         if key == 'long imdb canonical title':
             return build_title(info, canonical=True)
         if key == 'smart long imdb canonical title':
             return build_title(info, canonical=True,
                                lang=self.guessLanguage())
     if key == 'full-size cover url':
         return self.get_fullsizeURL()
     return None
Example #2
0
 def _getitem(self, key):
     """Compute the value of a special (derived) key, or return None.

     The three series-title keys and the three episode-title keys need
     'episode of' in self.data; the plain title keys need 'title';
     'full-size cover url' is delegated to get_fullsizeURL().
     """
     data = self.data
     if 'episode of' in data:
         if key == 'long imdb episode title':
             return build_title(data)
         if key in ('series title', 'canonical series title',
                    'smart canonical series title'):
             series = self._getSeriesTitle(data['episode of'])
             if key == 'series title':
                 return series
             if key == 'canonical series title':
                 return canonicalTitle(series)
             return self.smartCanonicalTitle(series)
         if key in ('episode title', 'canonical episode title',
                    'smart canonical episode title'):
             episode = data.get('title', '')
             if key == 'episode title':
                 return episode
             if key == 'canonical episode title':
                 return canonicalTitle(episode)
             return self.smartCanonicalTitle(episode)
     if 'title' in data:
         if key == 'title':
             return data['title']
         if key == 'long imdb title':
             return build_title(data)
         if key == 'canonical title':
             return canonicalTitle(data['title'])
         if key == 'smart canonical title':
             return self.smartCanonicalTitle(data['title'])
         if key == 'long imdb canonical title':
             return build_title(data, canonical=True)
         if key == 'smart long imdb canonical title':
             return build_title(data, canonical=True,
                                lang=self.guessLanguage())
     if key == 'full-size cover url':
         return self.get_fullsizeURL()
     return None
Example #3
0
 def _getitem(self, key):
     """Handle special keys.

     Computes title-related keys that are derived from self.data.  The
     'canonical ...' keys return the stored titles unchanged, while the
     plain keys pass them through normalizeTitle().  Episode/series keys
     are only handled when 'episode of' is in the data, title keys only
     when 'title' is.

     Returns None if the key is not special or the required data is
     missing.
     """
     data = self.data
     # Use 'in' rather than dict.has_key(): has_key() was removed in
     # Python 3, and 'in' behaves the same on Python 2.
     if 'episode of' in data:
         if key == 'long imdb episode title':
             return build_title(data, canonical=0)
         elif key in ('series title', 'canonical series title'):
             series = data['episode of']
             # Prefer the explicit canonical title, fall back to 'title'.
             ser_title = series.get('canonical title') or series['title']
             if key == 'series title':
                 return normalizeTitle(ser_title)
             return ser_title
         elif key == 'episode title':
             return normalizeTitle(data.get('title', u''))
         elif key == 'canonical episode title':
             return data.get('title', u'')
     if 'title' in data:
         if key == 'title':
             return normalizeTitle(data['title'])
         elif key == 'long imdb title':
             return build_title(data, canonical=0)
         elif key == 'canonical title':
             return data['title']
         elif key == 'long imdb canonical title':
             return build_title(data, canonical=1)
     return None
Example #4
0
 def _getitem(self, key):
     """Handle special keys.

     Computes title-related keys that are derived from self.data.
     Series/episode keys are only handled when 'episode of' is in the
     data, plain title keys only when 'title' is.

     Returns None if the key is not special or the required data is
     missing.
     """
     data = self.data
     # Use 'in' rather than dict.has_key(): has_key() was removed in
     # Python 3, and 'in' behaves the same on Python 2.
     if 'episode of' in data:
         if key == 'long imdb episode title':
             return build_title(data)
         elif key == 'series title':
             return data['episode of']['title']
         elif key == 'canonical series title':
             ser_title = data['episode of']['title']
             return canonicalTitle(ser_title)
         elif key == 'smart canonical series title':
             ser_title = data['episode of']['title']
             return self.smartCanonicalTitle(ser_title)
         elif key == 'episode title':
             return data.get('title', u'')
         elif key == 'canonical episode title':
             return canonicalTitle(data.get('title', u''))
         elif key == 'smart canonical episode title':
             return self.smartCanonicalTitle(data.get('title', u''))
     if 'title' in data:
         if key == 'title':
             return data['title']
         elif key == 'long imdb title':
             return build_title(data)
         elif key == 'canonical title':
             return canonicalTitle(data['title'])
         elif key == 'smart canonical title':
             return self.smartCanonicalTitle(data['title'])
         elif key == 'long imdb canonical title':
             return build_title(data, canonical=1)
         elif key == 'smart long imdb canonical title':
             return build_title(data, canonical=1,
                                lang=self.guessLanguage())
     return None
Example #5
0
 def _getitem(self, key):
     """Handle special keys.

     Derives title-related values from self.data: the 'canonical ...'
     keys return the stored titles as they are, the plain keys return
     them after normalizeTitle().  Series/episode keys require
     'episode of' in the data, title keys require 'title'.

     Returns None for any other key or when the needed data is absent.
     """
     # 'in' replaces dict.has_key(), which does not exist in Python 3;
     # the membership test is equivalent on Python 2.
     if 'episode of' in self.data:
         if key == 'long imdb episode title':
             return build_title(self.data, canonical=0)
         elif key == 'series title':
             ser_title = self.data['episode of'].get('canonical title') or \
                         self.data['episode of']['title']
             return normalizeTitle(ser_title)
         elif key == 'canonical series title':
             ser_title = self.data['episode of'].get('canonical title') or \
                         self.data['episode of']['title']
             return ser_title
         elif key == 'episode title':
             return normalizeTitle(self.data.get('title', u''))
         elif key == 'canonical episode title':
             return self.data.get('title', u'')
     if 'title' in self.data:
         if key == 'title':
             return normalizeTitle(self.data['title'])
         elif key == 'long imdb title':
             return build_title(self.data, canonical=0)
         elif key == 'canonical title':
             return self.data['title']
         elif key == 'long imdb canonical title':
             return build_title(self.data, canonical=1)
     return None
Example #6
0
def titleVariations(title, fromPtdf=0):
    """Return three variations of a title, useful for searches.

    If fromPtdf is true, the input is assumed to be in the plain text
    data files format.  The result is the tuple (title1, title2, title3):
    title1 is the canonical title, title2 is title1 without a trailing
    article (or title1 unchanged) and title3 is the long imdb canonical
    title, or an empty string when the input was just a title.
    """
    if fromPtdf:
        first = u''
    else:
        first = title
    second = third = u''
    if not fromPtdf and not re_year_index.search(title):
        # Just a title: only the canonical form can be derived.
        first = canonicalTitle(title)
    else:
        # A (year[/imdbIndex]) indication is present or implied: treat
        # the input as a long imdb canonical name.
        parsed = analyze_title(title, canonical=1)
        first = parsed['title']
        if parsed['kind'] == 'episode':
            first = normalizeTitle(first)
            third = build_title(parsed, canonical=1, ptdf=1)
        elif fromPtdf:
            third = title
        else:
            third = build_title(parsed, canonical=1, ptdf=1)
    if first:
        # Drop the trailing ", Article" part, if there is one.
        second = first
        pieces = second.split(u', ')
        if pieces[-1].lower() in _unicodeArticles:
            second = u', '.join(pieces[:-1])
    return first, second, third
Example #7
0
def titleVariations(title, fromPtdf=0):
    """Build the (title1, title2, title3) variations used by searches.

    title1 is the canonical title; title2 is title1 with a trailing
    article removed (or title1 itself); title3 is the long imdb
    canonical title, empty when only a bare title was given.  A true
    fromPtdf means the input is in the plain text data files format.
    """
    if fromPtdf:
        canonical = u''
    else:
        canonical = title
    no_article = long_canonical = u''
    looks_long = fromPtdf or re_year_index.search(title)
    if looks_long:
        # Seems to carry a (year[/imdbIndex]) part: parse it as a long
        # imdb canonical name.
        info = analyze_title(title, canonical=1)
        canonical = info['title']
        is_episode = info['kind'] == 'episode'
        if is_episode:
            canonical = normalizeTitle(canonical)
        if fromPtdf and not is_episode:
            long_canonical = title
        else:
            long_canonical = build_title(info, canonical=1, ptdf=1)
    else:
        # Just a title.
        canonical = canonicalTitle(title)
        long_canonical = u''
    if canonical:
        no_article = canonical
        chunks = no_article.split(u', ')
        if chunks[-1].lower() in _articles:
            no_article = u', '.join(chunks[:-1])
    return canonical, no_article, long_canonical
Example #8
0
 def _getitem(self, key):
     """Handle special keys.

     Returns the value of a derived, title-related key computed from
     self.data.  Series and episode keys are handled only when
     'episode of' is in the data, plain title keys only when 'title' is.
     Any other situation yields None.
     """
     # 'in' replaces dict.has_key(), which does not exist in Python 3;
     # the membership test is equivalent on Python 2.
     if 'episode of' in self.data:
         if key == 'long imdb episode title':
             return build_title(self.data)
         elif key == 'series title':
             return self.data['episode of']['title']
         elif key == 'canonical series title':
             ser_title = self.data['episode of']['title']
             return canonicalTitle(ser_title)
         elif key == 'smart canonical series title':
             ser_title = self.data['episode of']['title']
             return self.smartCanonicalTitle(ser_title)
         elif key == 'episode title':
             return self.data.get('title', u'')
         elif key == 'canonical episode title':
             return canonicalTitle(self.data.get('title', u''))
         elif key == 'smart canonical episode title':
             return self.smartCanonicalTitle(self.data.get('title', u''))
     if 'title' in self.data:
         if key == 'title':
             return self.data['title']
         elif key == 'long imdb title':
             return build_title(self.data)
         elif key == 'canonical title':
             return canonicalTitle(self.data['title'])
         elif key == 'smart canonical title':
             return self.smartCanonicalTitle(self.data['title'])
         elif key == 'long imdb canonical title':
             return build_title(self.data, canonical=1)
         elif key == 'smart long imdb canonical title':
             return build_title(self.data,
                                canonical=1,
                                lang=self.guessLanguage())
     return None
Example #9
0
 def isSameTitle(self, other):
     """Tell whether other refers to the same title as this object.

     Objects of a different class never match.  Otherwise the two match
     when both have a 'title' and their long imdb titles are equal, or
     when they share the access system and a non-None movieID.
     """
     # XXX: obsolete?
     if not isinstance(other, self.__class__):
         return False
     if 'title' in self.data and 'title' in other.data:
         mine = build_title(self.data, canonical=False)
         theirs = build_title(other.data, canonical=False)
         if mine == theirs:
             return True
     if self.accessSystem != other.accessSystem:
         return False
     if self.movieID is None:
         return False
     if self.movieID == other.movieID:
         return True
     return False
Example #10
0
 def isSameTitle(self, other):
     """Return True if other has the same long imdb title and/or the
     same movieID (within the same access system) as this object;
     instances of unrelated classes always compare as different.
     """
     # XXX: obsolete?
     if not isinstance(other, self.__class__):
         return False
     titles_known = 'title' in self.data and 'title' in other.data
     if titles_known and \
             build_title(self.data, canonical=False) == \
             build_title(other.data, canonical=False):
         return True
     same_system = self.accessSystem == other.accessSystem
     if same_system and self.movieID is not None \
             and self.movieID == other.movieID:
         return True
     return False
Example #11
0
 def isSameTitle(self, other):
     """Return true if this and the compared object have the same
     long imdb title and/or movieID.

     Returns 1 when other is an instance of this object's class and
     either both objects have a 'title' with equal long imdb titles, or
     they share the access system and a non-None movieID; 0 otherwise.
     """
     # XXX: obsolete?
     if not isinstance(other, self.__class__):
         return 0
     # 'in' instead of dict.has_key(): has_key() was removed in Python 3.
     if 'title' in self.data and 'title' in other.data and \
             build_title(self.data, canonical=0) == \
             build_title(other.data, canonical=0):
         return 1
     if self.accessSystem == other.accessSystem and \
             self.movieID is not None and self.movieID == other.movieID:
         return 1
     return 0
Example #12
0
 def isSameTitle(self, other):
     """Return true if this and the compared object have the same
     long imdb title and/or movieID.

     The result is 1 or 0.  Objects that are not instances of this
     object's class never match; the movieID comparison only counts when
     both objects use the same access system and the movieID is not None.
     """
     # XXX: obsolete?
     if not isinstance(other, self.__class__):
         return 0
     # Membership tests replace dict.has_key(), which Python 3 dropped.
     both_have_title = 'title' in self.data and 'title' in other.data
     if both_have_title and \
             build_title(self.data, canonical=0) == \
             build_title(other.data, canonical=0):
         return 1
     if self.accessSystem == other.accessSystem and \
             self.movieID is not None and self.movieID == other.movieID:
         return 1
     return 0
Example #13
0
 def get_imdbID(self, mop):
     """Return the imdbID for the given Movie, Person, Character or Company
     object.

     The lookup is done through this object when mop uses the same access
     system, otherwise through a new IMDb(mop.accessSystem) instance.
     If the object has an ID, it is translated with the matching
     get_imdb*ID method; otherwise the imdbID is searched by title/name.

     Raises IMDbError if mop is not one of the supported classes.
     """
     imdbID = None
     if mop.accessSystem == self.accessSystem:
         aSystem = self
     else:
         aSystem = IMDb(mop.accessSystem)
     if isinstance(mop, Movie.Movie):
         if mop.movieID is not None:
             imdbID = aSystem.get_imdbMovieID(mop.movieID)
         else:
             imdbID = aSystem.title2imdbID(
                 build_title(mop, canonical=0, ptdf=0, appendKind=False),
                 mop['kind'])
     elif isinstance(mop, Person.Person):
         if mop.personID is not None:
             imdbID = aSystem.get_imdbPersonID(mop.personID)
         else:
             imdbID = aSystem.name2imdbID(build_name(mop, canonical=1))
     elif isinstance(mop, Character.Character):
         if mop.characterID is not None:
             imdbID = aSystem.get_imdbCharacterID(mop.characterID)
         else:
             # canonical=0 ?
             imdbID = aSystem.character2imdbID(build_name(mop, canonical=1))
     elif isinstance(mop, Company.Company):
         if mop.companyID is not None:
             imdbID = aSystem.get_imdbCompanyID(mop.companyID)
         else:
             imdbID = aSystem.company2imdbID(build_company_name(mop))
     else:
         # The message lists every accepted class, Company included.
         raise IMDbError('object ' + repr(mop) +
                         ' is not a Movie, Person, Character or Company'
                         ' instance')
     return imdbID
Example #14
0
 def get_imdbMovieID(self, movieID):
     """Translate a movieID in an imdbID.
     If not in the database, try an Exact Primary Title search on IMDb;
     return None if it's unable to get the imdbID.

     When the imdbID is stored in the database it is returned as a
     zero-padded, 7-digit string; otherwise the value returned by
     title2imdbID() is passed back as is.
     """
     try:
         movie = Title.get(movieID)
     except NotFoundError:
         return None
     imdbID = movie.imdbID
     if imdbID is not None:
         return '%07d' % imdbID
     m_dict = get_movie_data(movie.id, self._kind)
     titline = build_title(m_dict, canonical=1, ptdf=1)
     imdbID = self.title2imdbID(titline)
     # If the imdbID was retrieved from the web and was not in the
     # database, update the database (ignoring errors, because it's
     # possible that the current user has no update privileges).
     if imdbID is not None:
         try:
             movie.imdbID = int(imdbID)
         except Exception:
             # Deliberately best-effort; "except Exception" (not a bare
             # except) lets KeyboardInterrupt/SystemExit propagate.
             pass
     return imdbID
Example #15
0
 def get_imdbID(self, mop):
     """Return the imdbID for the given Movie, Person, Character or Company
     object.

     This object is used for the lookup if mop shares its access system;
     otherwise a new IMDb(mop.accessSystem) instance is created.  An
     object with a known ID is translated through the corresponding
     get_imdb*ID method, one without is searched by its title or name.

     Raises IMDbError if mop is not one of the supported classes.
     """
     imdbID = None
     if mop.accessSystem == self.accessSystem:
         aSystem = self
     else:
         aSystem = IMDb(mop.accessSystem)
     if isinstance(mop, Movie.Movie):
         if mop.movieID is not None:
             imdbID = aSystem.get_imdbMovieID(mop.movieID)
         else:
             imdbID = aSystem.title2imdbID(build_title(mop, canonical=0,
                                             ptdf=0, appendKind=False),
                                             mop['kind'])
     elif isinstance(mop, Person.Person):
         if mop.personID is not None:
             imdbID = aSystem.get_imdbPersonID(mop.personID)
         else:
             imdbID = aSystem.name2imdbID(build_name(mop, canonical=1))
     elif isinstance(mop, Character.Character):
         if mop.characterID is not None:
             imdbID = aSystem.get_imdbCharacterID(mop.characterID)
         else:
             # canonical=0 ?
             imdbID = aSystem.character2imdbID(build_name(mop, canonical=1))
     elif isinstance(mop, Company.Company):
         if mop.companyID is not None:
             imdbID = aSystem.get_imdbCompanyID(mop.companyID)
         else:
             imdbID = aSystem.company2imdbID(build_company_name(mop))
     else:
         # Company is an accepted class too, so the message names it.
         raise IMDbError('object ' + repr(mop) +
                         ' is not a Movie, Person, Character or Company'
                         ' instance')
     return imdbID
Example #16
0
 def _search_movie(self, title, results, _episodes=False):
     """Search the titles key file for entries matching title.

     title     -- the title to look for; surrounding whitespace is
                  stripped and a blank title yields an empty list.
     results   -- maximum number of entries to return; the list is not
                  truncated when this is not greater than zero.
     _episodes -- if true, only the normalized title is searched and the
                  flag is forwarded to _scan_titles().

     Returns a list of (movieID, title data) tuples without duplicated
     movieIDs; entries that are AKAs are replaced by their real title,
     with the AKA stored under the 'akas' key.
     """
     title = title.strip()
     if not title:
         return []
     # Search for these title variations.
     if not _episodes:
         title1, title2, title3 = titleVariations(title, fromPtdf=1)
     else:
         title1 = normalizeTitle(title)
         title2 = ''
         title3 = ''
     # XXX: only a guess: results are shrinked, to exclude Adult
     #      titles and to remove duplicated entries.
     resultsST = results * 3
     res = _scan_titles('%stitles.key' % self.__db,
                         title1, title2, title3, resultsST, _episodes)
     res[:] = [x[1] for x in res]
     # Check for adult movies.
     if not self.doAdult:
         res[:] = [entry for entry in res
                   if 'Adult' not in getMovieMisc(
                       movieID=entry[0],
                       dataF='%s%s.data' % (self.__db, 'genres'),
                       indexF='%s%s.index' % (self.__db, 'genres'),
                       attrIF='%sattributes.index' % self.__db,
                       attrKF='%sattributes.key' % self.__db)]
     # Get the real name, if this is an AKA.
     # XXX: duplicated code!
     new_res = []
     # A set keeps the duplicate check O(1) per entry (assumes movieIDs
     # are hashable, e.g. integers).
     seen_MID = set()
     for movieID, r in res:
         # Remove duplicates.
         # XXX: find a way to prefer titles with an AKA?  Or prefer
         #      the original title?
         if movieID in seen_MID:
             continue
         seen_MID.add(movieID)
         realMID = self._get_real_movieID(movieID)
         if movieID == realMID:
             new_res.append((movieID, r))
             continue
         if realMID in seen_MID:
             continue
         seen_MID.add(realMID)
         aka_title = build_title(r, canonical=0)
         real_title = getLabel(realMID, '%stitles.index' % self.__db,
                             '%stitles.key' % self.__db)
         if aka_title == real_title:
             new_res.append((realMID, r))
             continue
         new_r = analyze_title(real_title, canonical=1)
         new_r['akas'] = [aka_title]
         new_res.append((realMID, new_r))
     if results > 0:
         new_res[:] = new_res[:results]
     return new_res
Example #17
0
 def _findRefs(self, o, trefs, nrefs):
     """Find titles or names references in strings.

     o is inspected recursively: strings are scanned with re_titleRef
     and re_nameRef, lists/tuples and dictionary values are descended
     into, anything else is ignored.  Every reference that can be
     resolved to an ID is stored in trefs (title -> Movie) or nrefs
     (name -> Person), also under its alternative spellings.

     Both dictionaries are modified in place and returned as the tuple
     (trefs, nrefs).
     """
     if isinstance(o, (unicode, str)):
         for title in re_titleRef.findall(o):
             a_title = analyze_title(title, canonical=0)
             rtitle = build_title(a_title, ptdf=1)
             # 'in' instead of dict.has_key(), removed in Python 3.
             if rtitle in trefs:
                 continue
             movieID = self._getTitleID(rtitle)
             if movieID is None:
                 # Retry with the reference exactly as found in the text.
                 movieID = self._getTitleID(title)
             if movieID is None:
                 continue
             m = Movie(title=rtitle,
                       movieID=movieID,
                       accessSystem=self.accessSystem)
             trefs[rtitle] = m
             rtitle2 = canonicalTitle(a_title.get('title', u''))
             if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                 trefs[rtitle2] = m
             if title != rtitle:
                 trefs[title] = m
         for name in re_nameRef.findall(o):
             a_name = analyze_name(name, canonical=1)
             rname = build_name(a_name, canonical=1)
             if rname in nrefs:
                 continue
             personID = self._getNameID(rname)
             if personID is None:
                 personID = self._getNameID(name)
             if personID is None:
                 continue
             p = Person(name=rname,
                        personID=personID,
                        accessSystem=self.accessSystem)
             nrefs[rname] = p
             rname2 = normalizeName(a_name.get('name', u''))
             if rname2 and rname2 != rname:
                 nrefs[rname2] = p
             if name != rname and name != rname2:
                 nrefs[name] = p
     elif isinstance(o, (list, tuple)):
         for item in o:
             self._findRefs(item, trefs, nrefs)
     elif isinstance(o, dict):
         for value in o.values():
             self._findRefs(value, trefs, nrefs)
     return (trefs, nrefs)
Example #18
0
 def get_imdbID(self, mop):
     """Return the imdbID of the given Movie object.

     The lookup goes through this object if mop uses the same access
     system, otherwise through a new IMDb(mop.accessSystem) instance.
     A movie with a movieID is translated with get_imdbMovieID(); one
     without is searched by title and kind.

     Raises IMDbError if mop is not a Movie.
     """
     if mop.accessSystem == self.accessSystem:
         aSystem = self
     else:
         aSystem = IMDb(mop.accessSystem)
     if not isinstance(mop, Movie.Movie):
         raise IMDbError('object ' + repr(mop) + ' is not a Movie')
     if mop.movieID is not None:
         return aSystem.get_imdbMovieID(mop.movieID)
     # No movieID: fall back to a search by title and kind.
     by_title = aSystem.title2imdbID
     return by_title(build_title(mop, canonical=0, ptdf=0,
                                 appendKind=False),
                     mop['kind'])
Example #19
0
 def _findRefs(self, o, trefs, nrefs):
     """Find titles or names references in strings.

     Strings are scanned with re_titleRef and re_nameRef; lists, tuples
     and dictionary values are inspected recursively; other objects are
     ignored.  References that resolve to an ID are added to trefs
     (title -> Movie) or nrefs (name -> Person), including their
     alternative spellings.

     The two dictionaries are updated in place and returned as the
     tuple (trefs, nrefs).
     """
     if isinstance(o, (UnicodeType, StringType)):
         for title in re_titleRef.findall(o):
             a_title = analyze_title(title, canonical=1)
             rtitle = build_title(a_title, canonical=1, ptdf=1)
             # 'in' instead of dict.has_key(), removed in Python 3.
             if rtitle in trefs:
                 continue
             movieID = self._getTitleID(rtitle)
             if movieID is None:
                 # Retry with the reference exactly as found in the text.
                 movieID = self._getTitleID(title)
             if movieID is None:
                 continue
             m = Movie(title=rtitle, movieID=movieID,
                         accessSystem=self.accessSystem)
             trefs[rtitle] = m
             rtitle2 = canonicalTitle(a_title.get('title', u''))
             if rtitle2 and rtitle2 != rtitle and rtitle2 != title:
                 trefs[rtitle2] = m
             if title != rtitle:
                 trefs[title] = m
         for name in re_nameRef.findall(o):
             a_name = analyze_name(name, canonical=1)
             rname = build_name(a_name, canonical=1)
             if rname in nrefs:
                 continue
             personID = self._getNameID(rname)
             if personID is None:
                 personID = self._getNameID(name)
             if personID is None:
                 continue
             p = Person(name=rname, personID=personID,
                         accessSystem=self.accessSystem)
             nrefs[rname] = p
             rname2 = normalizeName(a_name.get('name', u''))
             if rname2 and rname2 != rname:
                 nrefs[rname2] = p
             if name != rname and name != rname2:
                 nrefs[name] = p
     elif isinstance(o, (ListType, TupleType)):
         for item in o:
             self._findRefs(item, trefs, nrefs)
     elif isinstance(o, DictType):
         for value in o.values():
             self._findRefs(value, trefs, nrefs)
     return (trefs, nrefs)
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    # Parser applied to the page when the search was a direct hit.
    _BaseParser = DOMBasicMovieParser
    # Lowercase <title> tag that identifies a page listing results.
    _notDirectHitTitle = '<title>find - imdb</title>'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      'akas': "./i//text()"
                  },
                  # Each result becomes an (imdbID, title data, akas) tuple.
                  postprocess=lambda x:
                  (analyze_imdbid(x.get('link') or u''),
                   custom_analyze_title(x.get('info') or u''), x.get('akas')))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[@class='result_text']",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to be parsed as a list of results.

        A page listing results is returned almost unchanged; a direct
        hit is parsed with _BaseParser and turned into a one-row result
        table.  An empty string is returned if a direct hit yields no
        usable link or title.
        """
        if self._notDirectHitTitle in html_string[:10240].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                # XXX (HTU): does this still apply?
                html_string = html_string.replace('(TV mini-series)', '(mini)')
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res:
            return u''
        res = res['data']
        if not (res and res[0]):
            return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        title = self._titleBuilder(res[0][1])
        if not (link and title):
            return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (link,
                                                                         title)
        return new_html

    def postprocess_data(self, data):
        """Truncate the results and fold AKAs into the title data.

        Makes sure data['data'] exists, keeps at most self.results
        entries (when that attribute is set) and converts the
        (id, title data, akas) triplets into (id, title data) pairs,
        storing the AKA under the 'akas' key of the title data.
        """
        # 'in' instead of dict.has_key(), which was removed in Python 3.
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            data['data'] = [x for x in data['data'] if x[0] and x[1]]
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if not datum[0] and datum[1]:
                    continue
                aka = datum[2]
                # The AKA list is built only for movie searches; with any
                # other link prefix 'akas' used to be referenced while
                # unbound (NameError), so nothing is stored in that case.
                if aka is not None and self._linkPrefix == '/title/tt':
                    # XXX (HTU): couldn't find a result with multiple akas
                    datum[1]['akas'] = [aka[1:-1]]  # remove the quotes
                data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        return data
Example #21
0
    def _search_movie(self, title, results, _episodes=False):
        """Build and run the phonetic (soundex) database query for title.

        title     -- the title to search for; a blank title, or one whose
                     parsed 'title' part is empty, yields an empty list.
        results   -- number of requested results; in the visible code it
                     only bounds the nested search for the series.
        _episodes -- if true, restrict the search to episodes; it is
                     reset to False when the parsed title is itself an
                     episode of a series.

        Raises IMDbDataAccessError if the query raises NotFoundError.

        NOTE(review): this listing ends right after the query; title1,
        title2, title3 and qr are not used and nothing is returned in
        the visible code -- presumably the rest of the method is
        missing here; confirm against the complete source.
        """
        title = title.strip()
        if not title: return []
        title_dict = analyze_title(title, canonical=1)
        s_title = title_dict['title']
        if not s_title: return []
        episodeOf = title_dict.get('episode of')

        if not episodeOf:
            if not _episodes:
                # Drop a trailing ", Article" from the canonical title,
                # unless nothing would be left.
                s_title_split = s_title.split(', ')
                if len(s_title_split) > 1 and \
                        s_title_split[-1].lower() in _articles:
                    s_title_rebuilt = ', '.join(s_title_split[:-1])
                    if s_title_rebuilt:
                        s_title = s_title_rebuilt
        else:
            # The title names its series: the series-restricted branch
            # below is used instead of the generic episodes search.
            _episodes = False
            s_title = normalizeTitle(s_title)
        # Non-ASCII characters are dropped before computing the soundex.
        if isinstance(s_title, UnicodeType):
            s_title = s_title.encode('ascii', 'ignore')

        soundexCode = soundex(s_title)

        # XXX: improve the search restricting the kindID if the
        #      "kind" of the input differs from "movie"?
        condition = conditionAka = None
        if _episodes:
            condition = AND(Title.q.phoneticCode == soundexCode,
                            Title.q.kindID == self._kindRev['episode'])
            conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode,
                               AkaTitle.q.kindID == self._kindRev['episode'])
        elif title_dict['kind'] == 'episode' and episodeOf is not None:
            # Search the series first, then only the episodes that
            # belong to one of the series found.
            series_title = build_title(episodeOf, canonical=1)
            # XXX: is it safe to get "results" results?
            #      Too many?  Too few?
            serRes = results
            if serRes < 3 or serRes > 10:
                serRes = 10
            searchSeries = self._search_movie(series_title, serRes)
            seriesIDs = [result[0] for result in searchSeries]
            if seriesIDs:
                condition = AND(Title.q.phoneticCode == soundexCode,
                                IN(Title.q.episodeOfID, seriesIDs),
                                Title.q.kindID == self._kindRev['episode'])
                conditionAka = AND(
                    AkaTitle.q.phoneticCode == soundexCode,
                    IN(AkaTitle.q.episodeOfID, seriesIDs),
                    AkaTitle.q.kindID == self._kindRev['episode'])
            else:
                # XXX: bad situation: we have found no matching series;
                #      try searching everything (both episodes and
                #      non-episodes) for the title.
                # NOTE(review): seriesIDs is empty in this branch, so the
                # IN() clause is built over an empty list; that does not
                # look like "searching everything" -- confirm the intent.
                condition = AND(Title.q.phoneticCode == soundexCode,
                                IN(Title.q.episodeOfID, seriesIDs))
                conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode,
                                   IN(AkaTitle.q.episodeOfID, seriesIDs))
        if condition is None:
            # XXX: excludes episodes?
            condition = AND(Title.q.kindID != self._kindRev['episode'],
                            Title.q.phoneticCode == soundexCode)
            conditionAka = AND(AkaTitle.q.kindID != self._kindRev['episode'],
                               AkaTitle.q.phoneticCode == soundexCode)

        # Up to 3 variations of the title are searched, plus the
        # long imdb canonical title, if provided.
        if not _episodes:
            title1, title2, title3 = titleVariations(title)
        else:
            title1 = title
            title2 = ''
            title3 = ''
        try:
            # Matches from the titles table and from the AKA titles
            # table are collected in a single list of (id, data) tuples.
            qr = [(q.id, get_movie_data(q.id, self._kind))
                  for q in Title.select(condition)]
            q2 = [(q.movieID, get_movie_data(q.id, self._kind, fromAka=1))
                  for q in AkaTitle.select(conditionAka)]
            qr += q2
        except NotFoundError, e:
            raise IMDbDataAccessError, \
                    'unable to search the database: "%s"' % str(e)
Example #22
0
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parser for the html page that the IMDb web server returns when
    the "new search system" is used to look for movies.

    A page listing many results is parsed almost as-is; a "direct hit"
    (the page of a single movie) is first converted to a one-row list
    of results by preprocess_string."""

    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    # Each result row yields a (movieID, title data, AKAs text) tuple.
    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'info': ".//text()",
                'akas': ".//p[@class='find-aka']//text()",
            },
            postprocess=lambda x: (
                analyze_imdbid(x.get('link') or u''),
                custom_analyze_title(x.get('info') or u''),
                x.get('akas'),
            ),
        ),
    ]
    extractors = [
        Extractor(
            label='search',
            path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
            attrs=_attrs,
        ),
    ]

    def _init(self):
        """Set the initial (empty) url."""
        self.url = u''

    def _reset(self):
        """Clear the url."""
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to be parsed.

        A list of results is returned with minor clean-ups (applied
        only when searching for movies); a direct hit is parsed with
        self._BaseParser and replaced by a single fake row of results,
        or by an empty string if nothing useful was extracted."""
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix != '/title/tt':
                return html_string
            # Only for movies.
            html_string = html_string.replace('(TV mini-series)', '(mini)')
            # Prefix every AKA paragraph with '::', so that they can be
            # split apart in postprocess_data.
            return html_string.replace('<p class="find-aka">',
                                       '<p class="find-aka">::')
        # Direct hit!
        hit_parser = self._BaseParser(useModule=self._useModule)
        parsed = hit_parser.parse(html_string, url=self.url)
        if not parsed:
            return u''
        hits = parsed['data']
        if not hits or not hits[0]:
            return u''
        first_hit = hits[0]
        link = '%s%s' % (self._linkPrefix, first_hit[0])
        title = self._titleBuilder(first_hit[1])
        if not link or not title:
            return u''
        # Drop the pro.imdb.com host, if present in the link.
        link = link.replace('http://pro.imdb.com', '')
        return '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                   title)

    def postprocess_data(self, data):
        """Normalize the parsed results.

        Ensure that data['data'] exists, truncate it to self.results
        items (if that attribute is set) and turn every
        (movieID, info, akas) triple into a (movieID, info) pair,
        storing the list of AKAs in info['akas']."""
        if 'data' not in data:
            data['data'] = []
        limit = getattr(self, 'results', None)
        if limit is not None:
            data['data'][:] = data['data'][:limit]
        rows = data['data']
        # Horrible hack to support AKAs: only the first row is checked
        # to decide if the rows carry the third (AKAs) item.
        if not rows or len(rows[0]) != 3 or not isinstance(rows[0], tuple):
            return data
        for pos, row in enumerate(rows):
            raw_akas = row[2]
            if raw_akas is not None:
                akas = filter(None, raw_akas.split('::'))
                if self._linkPrefix == '/title/tt':
                    # Clean the 'aka "Title" - note' form used for movies.
                    akas = [aka.replace('" - ', '::').rstrip()
                               .replace('aka "', '', 1).lstrip()
                            for aka in akas]
                row[1]['akas'] = akas
            rows[pos] = (row[0], row[1])
        return data

    def add_refs(self, data):
        """Return data unchanged."""
        return data
Example #23
0
         # Build a Company object for the current row (mdata) and file it
         # under its section in the results.
         cDb = CompanyName.get(mdata[1])
         cDbTxt = cDb.name
         # Append the country code to the company name, when there is one.
         if cDb.countryCode:
             cDbTxt += ' %s' % cDb.countryCode
         company = Company(name=cDbTxt,
                           companyID=mdata[1],
                           notes=mdata[2] or u'',
                           accessSystem=self.accessSystem)
         res.setdefault(sect, []).append(company)
 # AKA titles.
 # Every AKA row of this movie is paired with its note.
 akat = [(get_movie_data(at.id, self._kind, fromAka=1), at.note)
         for at in AkaTitle.select(AkaTitle.q.movieID == movieID)]
 if akat:
     res['akas'] = []
     for td, note in akat:
         nt = build_title(td, canonical=1, ptdf=1)
         if note:
             # A non-None value returned by _changeAKAencoding replaces
             # the title built above.
             net = self._changeAKAencoding(note, nt)
             if net is not None: nt = net
             # The note follows the title, after a '::' separator.
             nt += '::%s' % note
         # Skip AKAs already collected.
         if nt not in res['akas']: res['akas'].append(nt)
 # Complete cast/crew.
 # NOTE(review): self._compcast presumably maps subject/status IDs to
 # their descriptions - confirm where it is built.
 compcast = [
     (self._compcast[cc.subjectID], self._compcast[cc.statusID])
     for cc in CompleteCast.select(CompleteCast.q.movieID == movieID)
 ]
 if compcast:
     for entry in compcast:
         # One 'complete <subject>' key per entry, with the status as
         # its (unicode) value.
         val = unicode(entry[1])
         res[u'complete %s' % entry[0]] = val
 # Movie connections.
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parser for the html page that the IMDb web server returns when
    the "new search system" is used to look for movies.

    A page listing many results is parsed after a minor clean-up; a
    "direct hit" (the page of a single movie) is first converted to a
    one-row list of results by preprocess_string."""

    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x, canonical=True)
    _linkPrefix = '/title/tt'

    # Each result row yields a (movieID, title data) pair.
    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={'link': "./a[1]/@href", 'info': ".//text()"},
            postprocess=lambda x: (
                analyze_imdbid(x.get('link') or u''),
                analyze_title(x.get('info') or u'', canonical=1),
            ),
        ),
    ]
    extractors = [
        Extractor(
            label='search',
            path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
            attrs=_attrs,
        ),
    ]

    def _init(self):
        """Set the initial (empty) url."""
        self.url = u''

    def _reset(self):
        """Clear the url."""
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to be parsed.

        A list of results is returned with minor clean-ups (applied
        only when searching for movies); a direct hit is parsed with
        self._BaseParser and replaced by a single fake row of results,
        or by an empty string if nothing useful was extracted."""
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix != '/title/tt':
                return html_string
            # Only for movies.
            html_string = html_string.replace('(TV mini-series)', '(mini)')
            # Whatever _reAKAS matches is replaced by a closing cell tag.
            return _reAKAS.sub('</td>', html_string)
        # Direct hit!
        hit_parser = self._BaseParser(useModule=self._useModule)
        parsed = hit_parser.parse(html_string, url=self.url)
        if not parsed:
            return u''
        hits = parsed['data']
        if not hits or not hits[0]:
            return u''
        first_hit = hits[0]
        link = '%s%s' % (self._linkPrefix, first_hit[0])
        title = self._titleBuilder(first_hit[1])
        if not link or not title:
            return u''
        # Drop the pro.imdb.com host, if present in the link.
        link = link.replace('http://pro.imdb.com', '')
        return '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                   title)

    def postprocess_data(self, data):
        """Ensure that data['data'] exists and truncate it to
        self.results items, if that attribute is set."""
        if 'data' not in data:
            data['data'] = []
        limit = getattr(self, 'results', None)
        if limit is None:
            return data
        data['data'][:] = data['data'][:limit]
        return data

    def add_refs(self, data):
        """Return data unchanged."""
        return data
Example #25
0
def scan_titles(titles_list,
                title1,
                title2,
                title3,
                results=0,
                searchingEpisode=0,
                onlyEpisodes=0,
                ro_thresold=None):
    """Scan a list of titles, searching for best matches against
    the given variations.

    titles_list is an iterable of (id, title_data) pairs; title_data
    is a dictionary with a 'title' key and, optionally, a 'kind' key.
    title1, title2 and title3 are the variations of the searched title;
    title2 and title3 can be empty strings.  A title3 ending with '}'
    forces a search amongst episodes.
    results, if greater than zero, is the maximum number of items
    returned.
    searchingEpisode restricts the scan to the episodes (otherwise
    episodes are skipped); with onlyEpisodes set, title1 alone is
    compared to the titles of the episodes, without their trailing
    parenthesized part.
    ro_thresold is the minimum ratio accepted for a match (0.6 if None).

    Return a list of (ratio, (id, title_data)) tuples, sorted in
    descending order."""
    if ro_thresold is not None:
        RO_THRESHOLD = ro_thresold
    else:
        RO_THRESHOLD = 0.6
    sm1 = SequenceMatcher()
    sm2 = SequenceMatcher()
    sm3 = SequenceMatcher()
    # Every matcher is primed with a searched variation as its *first*
    # sequence (sm2 used to call set_seq2, unlike sm1 and sm3).
    # NOTE(review): this relies on ratcliff (defined elsewhere) filling
    # in only the second sequence of the given matcher - confirm.
    sm1.set_seq1(title1.lower())
    sm2.set_seq1(title2.lower())
    if title3:
        sm3.set_seq1(title3.lower())
        if title3[-1] == '}':
            searchingEpisode = 1
    hasArt = title2 != title1
    resd = {}
    for i, t_data in titles_list:
        if onlyEpisodes:
            if t_data.get('kind') != 'episode':
                continue
            til = t_data['title']
            # endswith() is safe on empty titles, which are discarded
            # by the check below instead of raising an IndexError.
            if til.endswith(')'):
                dateIdx = til.rfind('(')
                if dateIdx != -1:
                    til = til[:dateIdx].rstrip()
            if not til:
                continue
            ratio = ratcliff(title1, til, sm1)
            if ratio >= RO_THRESHOLD:
                resd[i] = (ratio, (i, t_data))
            continue
        if searchingEpisode:
            if t_data.get('kind') != 'episode':
                continue
        elif t_data.get('kind') == 'episode':
            continue
        til = t_data['title']
        # XXX: on Symbian, here we get a str; not sure this is the
        #      right place to fix it.
        if isinstance(til, str):
            til = unicode(til, 'latin1', 'ignore')
        # Distance with the canonical title (with or without article).
        #   titleS      -> titleR
        #   titleS, the -> titleR, the
        if not searchingEpisode:
            til = canonicalTitle(til)
            # Small bonus for a match on the first variation.
            ratios = [ratcliff(title1, til, sm1) + 0.05]
            # til2 is til without the article, if present.
            til2 = til
            tils = til2.split(', ')
            matchHasArt = 0
            if tils[-1].lower() in _unicodeArticles:
                til2 = ', '.join(tils[:-1])
                matchHasArt = 1
            if hasArt and not matchHasArt:
                #   titleS[, the]  -> titleR
                ratios.append(ratcliff(title2, til, sm2))
            elif matchHasArt and not hasArt:
                #   titleS  -> titleR[, the]
                ratios.append(ratcliff(title1, til2, sm1))
        else:
            ratios = [0.0]
        if title3:
            # Distance with the long imdb canonical title (with a
            # larger bonus).
            ratios.append(
                ratcliff(title3, build_title(t_data, canonical=1, ptdf=1), sm3)
                + 0.1)
        ratio = max(ratios)
        if ratio >= RO_THRESHOLD:
            # Keep only the best ratio seen for a given id.
            if i not in resd or ratio > resd[i][0]:
                resd[i] = (ratio, (i, t_data))
    # The ids are unique, so no two items compare equal and the order
    # is the same produced by sort() followed by reverse().
    res = sorted(resd.values(), reverse=True)
    if results > 0:
        res = res[:results]
    return res
Example #26
0
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    # Each result cell yields a (movieID, title data, AKA text) tuple.
    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      'akas': "./i//text()"
                  },
                  postprocess=lambda x:
                  (analyze_imdbid(x.get('link') or ''),
                   custom_analyze_title(x.get('info') or ''), x.get('akas')))
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[@class='result_text']",
                  attrs=_attrs)
    ]

    def _init(self):
        """Set the initial (empty) url."""
        self.url = ''

    def _reset(self):
        """Clear the url."""
        self.url = ''

    def postprocess_data(self, data):
        """Normalize the parsed results.

        Ensure that data['data'] exists and truncate it to self.results
        items (if that attribute is set).  When the rows are
        (movieID, info, akas) triples, drop the rows missing the
        movieID or the info, and turn the others into (movieID, info)
        pairs, storing the list of AKAs in info['akas']."""
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs: only the first row is checked
        # to decide if the rows carry the third (AKAs) item.
        if data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            data['data'] = [x for x in data['data'] if x[0] and x[1]]
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if datum[2] is not None:
                    if self._linkPrefix == '/title/tt':
                        # XXX (HTU): couldn't find a result with multiple akas
                        akas = [datum[2][1:-1]]  # remove the quotes
                    else:
                        # Without this branch 'akas' was unbound for any
                        # other prefix, raising an UnboundLocalError.
                        # NOTE(review): '::' is the separator used by the
                        # previous implementation - confirm it still
                        # applies to the subclasses.
                        akas = [a for a in datum[2].split('::') if a]
                    datum[1]['akas'] = akas
                data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        """Return data unchanged."""
        return data
Example #27
0
def scan_titles(titles_list, title1, title2, title3, results=0,
                searchingEpisode=0, onlyEpisodes=0, ro_thresold=None):
    """Scan a list of titles, searching for best matches against
    the given variations.

    titles_list is an iterable of (id, title_data) pairs; title_data
    is a dictionary with a 'title' key and, optionally, a 'kind' key.
    title1, title2 and title3 are the variations of the searched title;
    title2 and title3 can be empty strings.  A title3 ending with '}'
    forces a search amongst episodes.
    results, if greater than zero, is the maximum number of items
    returned.
    searchingEpisode restricts the scan to the episodes (otherwise
    episodes are skipped); with onlyEpisodes set, title1 alone is
    compared to the titles of the episodes, without their trailing
    parenthesized part.
    ro_thresold is the minimum ratio accepted for a match (0.6 if None).

    Return a list of (ratio, (id, title_data)) tuples, sorted in
    descending order."""
    if ro_thresold is not None:
        RO_THRESHOLD = ro_thresold
    else:
        RO_THRESHOLD = 0.6
    sm1 = SequenceMatcher()
    sm2 = SequenceMatcher()
    sm3 = SequenceMatcher()
    # Every matcher is primed with a searched variation as its *first*
    # sequence (sm2 used to call set_seq2, unlike sm1 and sm3).
    # NOTE(review): this relies on ratcliff (defined elsewhere) filling
    # in only the second sequence of the given matcher - confirm.
    sm1.set_seq1(title1.lower())
    sm2.set_seq1(title2.lower())
    if title3:
        sm3.set_seq1(title3.lower())
        if title3[-1] == '}':
            searchingEpisode = 1
    hasArt = title2 != title1
    resd = {}
    for i, t_data in titles_list:
        if onlyEpisodes:
            if t_data.get('kind') != 'episode':
                continue
            til = t_data['title']
            # endswith() is safe on empty titles, which are discarded
            # by the check below instead of raising an IndexError.
            if til.endswith(')'):
                dateIdx = til.rfind('(')
                if dateIdx != -1:
                    til = til[:dateIdx].rstrip()
            if not til:
                continue
            ratio = ratcliff(title1, til, sm1)
            if ratio >= RO_THRESHOLD:
                resd[i] = (ratio, (i, t_data))
            continue
        if searchingEpisode:
            if t_data.get('kind') != 'episode':
                continue
        elif t_data.get('kind') == 'episode':
            continue
        til = t_data['title']
        # XXX: on Symbian, here we get a str; not sure this is the
        #      right place to fix it.
        if isinstance(til, str):
            til = unicode(til, 'latin1', 'ignore')
        # Distance with the canonical title (with or without article).
        #   titleS      -> titleR
        #   titleS, the -> titleR, the
        if not searchingEpisode:
            # Small bonus for a match on the first variation.
            ratios = [ratcliff(title1, til, sm1) + 0.05]
            # til2 is til without the article, if present.
            til2 = til
            tils = til2.split(', ')
            matchHasArt = 0
            if tils[-1].lower() in _articles:
                til2 = ', '.join(tils[:-1])
                matchHasArt = 1
            if hasArt and not matchHasArt:
                #   titleS[, the]  -> titleR
                ratios.append(ratcliff(title2, til, sm2))
            elif matchHasArt and not hasArt:
                #   titleS  -> titleR[, the]
                ratios.append(ratcliff(title1, til2, sm1))
        else:
            ratios = [0.0]
        if title3:
            # Distance with the long imdb canonical title (with a
            # larger bonus).
            ratios.append(ratcliff(title3,
                        build_title(t_data, canonical=1, ptdf=1), sm3) + 0.1)
        ratio = max(ratios)
        if ratio >= RO_THRESHOLD:
            # Keep only the best ratio seen for a given id.
            if i not in resd or ratio > resd[i][0]:
                resd[i] = (ratio, (i, t_data))
    # The ids are unique, so no two items compare equal and the order
    # is the same produced by sort() followed by reverse().
    res = sorted(resd.values(), reverse=True)
    if results > 0:
        res = res[:results]
    return res