# Example 1
# 0
def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching.

    Reads every title from the IMDb movies and AKA-titles lists and
    writes one tab-separated record per title to the compressed index
    file ``dbfile + '.idx'``:

        SEARCHABLE  YEAR  TITLE  AKAFOR  NRATINGS

    SEARCHABLE is the cleaned title used for substring matching,
    AKAFOR is the canonical title when the entry is an alias (empty
    otherwise), and NRATINGS is the rating count used to rank results.

    dbfile -- Base path of the database; index written to dbfile+'.idx'.
    dbdir -- Directory containing the raw IMDb list files.
    debug -- Pass-through flag for the ratings parser.

    NOTE(review): a disabled stopwords/word-frequency feature (an
    ``if False:`` branch fed by a Counter) was removed as dead code.
    """
    # Load ratings; number of ratings included in index for score weighting
    ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()

    indexfh = open_compressed(dbfile + '.idx', mode='w')

    # Index all IMDb titles, from both the canonical and the AKA lists.
    skipped = 0
    for iterator in \
            (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
             parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search()):
        last_aka = None
        for obj in iterator:
            if len(obj) == 1:   # movies.list.gz
                data = parsers.parse_title(obj[0])
                akafor = ''
            else:               # aka-titles.list.gz
                data = parsers.parse_title(obj[1])  # AKA name of the title
                akafor = obj[0]             # Real name of the title
                # The same AKA title may be repeated with different
                # qualifiers, for example:
                #     (aka Die Hard 4.0 (2007)) (UK)
                #     (aka Die Hard 4.0 (2007)) (Germany)
                # Index each (real title, AKA) pair only once.
                if last_aka and last_aka[0:2] == obj[0:2]:
                    skipped += 1
                    continue
                last_aka = obj
            searchable = _clean_word(data.name).split(' ')
            # Determine rating for result ranking; AKA entries look up
            # the rating of their canonical title.
            nratings = 0
            if akafor and akafor in ratings:
                nratings = ratings[akafor].nratings
            elif not akafor and data.title in ratings:
                nratings = ratings[data.title].nratings
            # Write one tab-separated record for this title.
            indexfh.write("\t".join((''.join(searchable),
                                     data.year.encode('ascii')
                                     if data.year else '',
                                     data.title.encode('utf-8'),
                                     akafor.encode('utf-8'),
                                     str(nratings))))
            indexfh.write("\n")
    indexfh.close()
# Example 2
# 0
def _search_index(timer, dbfile, words, size, strip_stems=True,
                  year=None, deltayear=8, debug=False):
    """Yield a subset of the database that somewhat matches words.
    Returns any movies that contains a subword of any of words.
    (See the _subwords function.) Shorter subwords means more results, but
    slower performance.

    words -- List of words.
    size -- Length of subwords to use for search. (See _subwords function.)
    strip_stems -- Omit really common subwords. (See _subwords function.)
    year -- A guess of the year. Only returns movies dated near year.
    deltayear -- Only return movies with year [year-deltayear,year+deltayear].
    """
    # Extract a plausible-looking subset of the database so we don't
    # have to run SequenceMatcher on everything. This works pretty
    # well, except for movies like O (2001).

    # A list of plain-text strings that we expect to find in the
    # SEARCHABLE field of the data. We will require at least one of
    # these to be present.
    wordlist = tuple(_subwords(_clean_words(words, strip_stems), size))
    # If we are provided with an estimated year, compose a list of
    # acceptable years.
    validyears = range(year-deltayear, year+deltayear) if year else ()
    if debug:
        print wordlist
        print "Searching..."

    # Reading lines out of a GzipFile is very slow; using gzip(1) is ~6.5x
    # faster. For further speedup, we could use zgrep(1) to extract our
    # subset using grep(1).
    #indexfh = ChunkedFile(dbfile, 'index')
    indexfh = open_compressed(dbfile+'.idx')
    #indexfh = open('idx.tmp')

    for i, line in enumerate(indexfh):
        # Quick check to determine if the entry matches any of our words
        # (grep -F is faster; grep -E might be faster still)
        for word in wordlist:
            if word in line:
                break
        else:
            continue

        # Get SEARCHABLE\tYEAR\tTITLE
        ryear, title, akafor, nratings = line.decode('utf-8').split('\t')[1:]

        # Check that the year is within tolerances
        if validyears and ryear and int(ryear) not in validyears:
            continue

        yield title, ryear, akafor, nratings
        if i % 100 == 0:
            timer.step()
    indexfh.close()
    if debug:
        print 'Completed search in', timer, 'seconds.'
def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching.

    Writes one tab-separated record per IMDb title (canonical and AKA)
    to the compressed index ``dbfile + '.idx'``:

        SEARCHABLE  YEAR  TITLE  AKAFOR  NRATINGS

    AKAFOR holds the canonical title for alias entries (empty for
    canonical ones); NRATINGS weights search-result ranking.

    dbfile -- Base path of the database; index written to dbfile+'.idx'.
    dbdir -- Directory containing the raw IMDb list files.
    debug -- Pass-through flag for the ratings parser.

    NOTE(review): removed an unreachable ``if False:`` stopwords branch
    and the word-frequency Counter that only it consumed (dead code).
    """
    # Load ratings; number of ratings included in index for score weighting
    ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()

    def rating_count(title):
        # Number of IMDb ratings recorded for title, 0 when unrated.
        return ratings[title].nratings if title in ratings else 0

    indexfh = open_compressed(dbfile + '.idx', mode='w')

    skipped = 0  # duplicate AKA entries dropped
    for iterator in \
            (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
             parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search()):
        previous = None
        for obj in iterator:
            if len(obj) == 1:  # movies.list.gz
                data = parsers.parse_title(obj[0])
                akafor = ''
            else:  # aka-titles.list.gz
                data = parsers.parse_title(obj[1])  # AKA name of the title
                akafor = obj[0]  # Real name of the title
                # Duplicate AKA lines differ only in their qualifier:
                #     (aka Die Hard 4.0 (2007)) (UK)
                #     (aka Die Hard 4.0 (2007)) (Germany)
                # Index each (real title, AKA) pair only once.
                if previous and previous[0:2] == obj[0:2]:
                    skipped += 1
                    continue
                previous = obj
            searchable = _clean_word(data.name).split(' ')
            # AKA entries rank by their canonical title's rating count.
            nratings = rating_count(akafor if akafor else data.title)
            # Emit one tab-separated record for this title.
            fields = (''.join(searchable),
                      data.year.encode('ascii') if data.year else '',
                      data.title.encode('utf-8'),
                      akafor.encode('utf-8'),
                      str(nratings))
            indexfh.write("\t".join(fields))
            indexfh.write("\n")
    indexfh.close()
def _search_index(timer,
                  dbfile,
                  words,
                  size,
                  strip_stems=True,
                  year=None,
                  deltayear=8,
                  debug=False):
    """Yield a subset of the database that somewhat matches words.
    Returns any movies that contains a subword of any of words.
    (See the _subwords function.) Shorter subwords means more results, but
    slower performance.

    words -- List of words.
    size -- Length of subwords to use for search. (See _subwords function.)
    strip_stems -- Omit really common subwords. (See _subwords function.)
    year -- A guess of the year. Only returns movies dated near year.
    deltayear -- Only return movies with year [year-deltayear,year+deltayear].
    """
    # Extract a plausible-looking subset of the database so we don't
    # have to run SequenceMatcher on everything. This works pretty
    # well, except for movies like O (2001).

    # A list of plain-text strings that we expect to find in the
    # SEARCHABLE field of the data. We will require at least one of
    # these to be present.
    wordlist = tuple(_subwords(_clean_words(words, strip_stems), size))
    # If we are provided with an estimated year, compose a list of
    # acceptable years.
    validyears = range(year - deltayear, year + deltayear) if year else ()
    if debug:
        print wordlist
        # print "Searching..."

    # Reading lines out of a GzipFile is very slow; using gzip(1) is ~6.5x
    # faster. For further speedup, we could use zgrep(1) to extract our
    # subset using grep(1).
    #indexfh = ChunkedFile(dbfile, 'index')
    indexfh = open_compressed(dbfile + '.idx')
    #indexfh = open('idx.tmp')

    for i, line in enumerate(indexfh):
        # Quick check to determine if the entry matches any of our words
        # (grep -F is faster; grep -E might be faster still)
        for word in wordlist:
            if word in line:
                break
        else:
            continue

        # Get SEARCHABLE\tYEAR\tTITLE
        ryear, title, akafor, nratings = line.decode('utf-8').split('\t')[1:]

        # Check that the year is within tolerances
        if validyears and ryear and int(ryear) not in validyears:
            continue

        yield title, ryear, akafor, nratings
        if i % 100 == 0:
            timer.step()
    indexfh.close()
# Example 5
# 0
    def _run_search(self, queries):
        """Yield database entries matching any item in queries.

        queries -- iterable of lookup keys, or None to yield every
                   entry in the database.

        Generator: yields self._make_result(data) for each matching
        line.  When queries are given, uses either the separate index
        (self.indexname) or in-file bookmarks to seek near each query
        before scanning.
        """
        if queries is not None:
            queries = set(queries)
            # Don't do anything if an empty set is provided
            if not queries:
                return

        # Open the compressed database, either copied version or original file.
        if self.dbfile:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
        else:
            # Without a combined database only a single original list
            # file can be scanned.
            assert(len(self.origfiles) == 1)
            try:
                fileobj = open_compressed(self.origfiles[0])
            except IOError as e:
                print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
                return
            self._skip_header(fileobj)
        if self.debug:
            print "Reading %s..." % self.listname

        # Locate seek positions for all queries.  Each element of locs
        # is a (startloc, endloc, nresults) tuple.
        if queries and self.indexname:  # Use index
            locs = list(_find_seeks_index(self.dbfile, self.indexname, queries,
                                          debug=self.debug))
        elif queries:                   # Use bookmarks
            locs = list(_find_seeks_bookmarks(fileobj, queries,
                                              debug=self.debug))
        else:
            locs = [(None, None, 1)]     # Dummy values to start loop
        # Read selected lines from the file
        timer = Timer()
        loc = 0
        for startloc, endloc, nresults in locs:
            # Skip to the correct position in the file
            if queries:
                if startloc > loc:
                    #print "  Seek to", startloc
                    fileobj.seek(startloc)
                    loc = fileobj.tell()
                elif startloc < loc:
                    # Already past this section: it was covered while
                    # scanning a previous one.
                    #print "  Skipping", startloc, "already at", loc
                    continue
                #else:
                #    print "  Skipping", startloc, "already there"
                #print "    Finish at", endloc, "after", nresults, "results"
            for _ in xrange(nresults):
                # Parse the file until we get a result
                for i, line in enumerate(fileobj):
                    # Determine if we have reached the end location for this
                    # section
                    if endloc and loc == endloc:
                        break
                    #assert(not endloc or loc < endloc)

                    # Do not index video games or individual TV episodes
                    # (Not applicable for all file types)
                    if not self.dbfile and self.skip_tvvg and \
                            ('(VG)' in line or '{' in line):
                        #loc = fileobj.tell() # Don't seek/tell in gzip
                        continue
                    # Decode database (IMDb databases use ISO-8859-1)
                    line = line.rstrip().decode('iso-8859-1')

                    if queries and i % 100 == 0:
                        timer.step()

                    # loc is the position at the *start* of this line;
                    # advance it (via tell) only for a seekable
                    # ChunkedFile, never for a gzip stream.
                    data = self._parse_line(line, loc)
                    if self.dbfile:
                        loc = fileobj.tell()

                    if data is None:
                        break           # End of database
                    if not data:
                        continue        # Skip this line

                    # Check if one of our queries matches
                    if queries is None or data[0] in queries:
                        yield self._make_result(data)
                        if queries is not None:
                            # queries.remove(data[0])
                            break

        if self.debug:
            print 'Completed in', timer, 'seconds.'
        fileobj.close()
# Example 6
# 0
    def rebuild_index(self, do_copy=True):
        """Create an index for this file, to allow rapid seeking to information
        about a given title.

        do_copy -- Copy the original list files into the ChunkedFile
                   database while indexing.  Only True is implemented.

        Builds either a separate index chunk (when self.indexname is
        set) mapping each title to its index lines, or in-file
        bookmarks on the copied data.
        """
        if do_copy:
            # Flush after each chunk only when a separate index will
            # record offsets into the copy.
            copy_to = ChunkedFile(self.dbfile, self.listname, mode='a',
                                  autoflush=True if self.indexname else False)
            tellobj = copy_to
            filenames = self.origfiles
        else:
            #filenames = ???
            copy_to = None
            raise NotImplementedError

        # Maps encoded title -> list of index lines for that title.
        indexobj = defaultdict(list)

        for filename in filenames:
            if do_copy:
                try:
                    fileobj = open_compressed(filename)
                except IOError as e:
                    print "  Skipping %s: %s" % (filename, e.strerror)
                    continue
            else:
                fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
                tellobj = fileobj

            self._skip_header(fileobj)
            # Get location of this line (position where it *starts*)
            loc = tellobj.tell()
            for line in fileobj:
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                if self.skip_tvvg and ('(VG)' in line or '{' in line):
                    continue
                # Copy the raw (still encoded) line before any decoding.
                if copy_to:
                    copy_to.write(line)
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')

                # Parse with the line's start offset, then advance loc
                # to the start of the next line.
                data = self._parse_line(line, loc)
                loc = tellobj.tell()
                if data is None:
                    break           # End of database
                if not data:
                    continue        # Skip this line

                # Add to the index
                title, idxline = data[0:2] #self._make_locator(data)
                title = title.encode('utf-8')
                if self.indexname:
                    indexobj[title].append(idxline)
                elif copy_to:
                    copy_to.bookmark(title)
            fileobj.close()
        if copy_to:
            copy_to.close()

        if self.indexname:
            # Write out a separate index, if required (e.g. names databases)
            indexfh = ChunkedFile(self.dbfile, self.indexname, mode='a',
                                  autoflush=False)
            for title, linenos in sorted(indexobj.items()):
                indexfh.write(title)
                indexfh.write("\t")
                indexfh.write(' '.join(str(i) for i in linenos))
                indexfh.write("\n")
                indexfh.bookmark(title)
            indexfh.close()
        else:
            # An index is required to use more than one file, since the
            # resulting combination will not be sorted
            assert(len(filenames) == 1)
    def _run_search(self, queries):
        """Yield database entries matching any item in queries.

        queries -- iterable of lookup keys, or None to yield every
                   entry in the database.

        Generator: yields self._make_result(data) for each matching
        line.  When queries are given, uses either the separate index
        (self.indexname) or in-file bookmarks to seek near each query
        before scanning.
        """
        if queries is not None:
            queries = set(queries)
            # Don't do anything if an empty set is provided
            if not queries:
                return

        # Open the compressed database, either copied version or original file.
        if self.dbfile:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
        else:
            # Without a combined database only a single original list
            # file can be scanned.
            assert (len(self.origfiles) == 1)
            try:
                fileobj = open_compressed(self.origfiles[0])
            except IOError as e:
                print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
                return
            self._skip_header(fileobj)
        # if self.debug:
        #     print "Reading %s..." % self.listname

        # Locate seek positions for all queries.  Each element of locs
        # is a (startloc, endloc, nresults) tuple.
        if queries and self.indexname:  # Use index
            locs = list(
                _find_seeks_index(self.dbfile,
                                  self.indexname,
                                  queries,
                                  debug=self.debug))
        elif queries:  # Use bookmarks
            locs = list(
                _find_seeks_bookmarks(fileobj, queries, debug=self.debug))
        else:
            locs = [(None, None, 1)]  # Dummy values to start loop

        # Read selected lines from the file
        timer = Timer()
        loc = 0
        for startloc, endloc, nresults in locs:
            # Skip to the correct position in the file
            if queries:
                if startloc > loc:
                    #print "  Seek to", startloc
                    fileobj.seek(startloc)
                    loc = fileobj.tell()
                elif startloc < loc:
                    # Already past this section: it was covered while
                    # scanning a previous one.
                    #print "  Skipping", startloc, "already at", loc
                    continue
                #else:
                #    print "  Skipping", startloc, "already there"
                #print "    Finish at", endloc, "after", nresults, "results"
            for _ in xrange(nresults):
                # Parse the file until we get a result
                for i, line in enumerate(fileobj):
                    # Determine if we have reached the end location for this
                    # section
                    if endloc and loc == endloc:
                        break
                    #assert(not endloc or loc < endloc)

                    # Do not index video games or individual TV episodes
                    # (Not applicable for all file types)
                    if not self.dbfile and self.skip_tvvg and \
                            ('(VG)' in line or '{' in line):
                        #loc = fileobj.tell() # Don't seek/tell in gzip
                        continue
                    # Decode database (IMDb databases use ISO-8859-1)
                    line = line.rstrip().decode('iso-8859-1')

                    if queries and i % 100 == 0:
                        timer.step()

                    # loc is the position at the *start* of this line;
                    # advance it (via tell) only for a seekable
                    # ChunkedFile, never for a gzip stream.
                    data = self._parse_line(line, loc)
                    if self.dbfile:
                        loc = fileobj.tell()

                    if data is None:
                        break  # End of database
                    if not data:
                        continue  # Skip this line

                    # Check if one of our queries matches
                    if queries is None or data[0] in queries:
                        yield self._make_result(data)
                        if queries is not None:
                            # queries.remove(data[0])
                            break

        # if self.debug:
        #     print 'Completed in', timer, 'seconds.'
        fileobj.close()
    def rebuild_index(self, do_copy=True):
        """Create an index for this file, to allow rapid seeking to information
        about a given title.

        do_copy -- Copy the original list files into the ChunkedFile
                   database while indexing.  Only True is implemented.

        Builds either a separate index chunk (when self.indexname is
        set) mapping each title to its index lines, or in-file
        bookmarks on the copied data.
        """
        if do_copy:
            # Flush after each chunk only when a separate index will
            # record offsets into the copy.
            copy_to = ChunkedFile(self.dbfile,
                                  self.listname,
                                  mode='a',
                                  autoflush=True if self.indexname else False)
            tellobj = copy_to
            filenames = self.origfiles
        else:
            #filenames = ???
            copy_to = None
            raise NotImplementedError

        # Maps encoded title -> list of index lines for that title.
        indexobj = defaultdict(list)

        for filename in filenames:
            if do_copy:
                try:
                    fileobj = open_compressed(filename)
                except IOError as e:
                    print "  Skipping %s: %s" % (filename, e.strerror)
                    continue
            else:
                fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
                tellobj = fileobj

            self._skip_header(fileobj)
            # Get location of this line (position where it *starts*)
            loc = tellobj.tell()
            for line in fileobj:
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                if self.skip_tvvg and ('(VG)' in line or '{' in line):
                    continue
                # Copy the raw (still encoded) line before any decoding.
                if copy_to:
                    copy_to.write(line)
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')

                # Parse with the line's start offset, then advance loc
                # to the start of the next line.
                data = self._parse_line(line, loc)
                loc = tellobj.tell()
                if data is None:
                    break  # End of database
                if not data:
                    continue  # Skip this line

                # Add to the index
                title, idxline = data[0:2]  #self._make_locator(data)
                title = title.encode('utf-8')
                if self.indexname:
                    indexobj[title].append(idxline)
                elif copy_to:
                    copy_to.bookmark(title)
            fileobj.close()
        if copy_to:
            copy_to.close()

        if self.indexname:
            # Write out a separate index, if required (e.g. names databases)
            indexfh = ChunkedFile(self.dbfile,
                                  self.indexname,
                                  mode='a',
                                  autoflush=False)
            for title, linenos in sorted(indexobj.items()):
                indexfh.write(title)
                indexfh.write("\t")
                indexfh.write(' '.join(str(i) for i in linenos))
                indexfh.write("\n")
                indexfh.bookmark(title)
            indexfh.close()
        else:
            # An index is required to use more than one file, since the
            # resulting combination will not be sorted
            assert (len(filenames) == 1)