Example #1
0
def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching.

    Reads every title from movies.list.gz and aka-titles.list.gz and
    writes one tab-separated record per title to ``dbfile + '.idx'``:
    searchable name, year, canonical title, AKA-for title (empty for
    primary titles), and the title's number of ratings (for ranking).
    """
    # Load ratings; number of ratings included in index for score weighting
    ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()

    # Count word frequencies while outputting searchable list
    frequencies = Counter()
    #indexfh = ChunkedFile(dbfile, 'index', mode='a')
    indexfh = open_compressed(dbfile+'.idx', mode='w')

    # Index all IMDb titles
    skipped = 0
    for iterator in \
            (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
             parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search()):
        last_time = None
        for obj in iterator:
            if len(obj) == 1:   # movies.list.gz
                data = parsers.parse_title(obj[0])
                akafor = ''
            else:               # aka-titles.list.gz
                data = parsers.parse_title(obj[1])  # AKA name of the title
                akafor = obj[0]             # Real name of the title
                # If it's a duplicate AKA (for indexing purposes), skip it.
                # The same AKA title may be repeated. For example:
                #     (aka Die Hard 4.0 (2007)) (UK)
                #     (aka Die Hard 4.0 (2007)) (Germany)
                if last_time and last_time[0:2] == obj[0:2]:
                    skipped += 1
                    continue
                last_time = obj
            # Normalized words of the title; feeds both the searchable
            # key and the word-frequency statistics.
            searchable = _clean_word(data.name).split(' ')
            # Save word frequencies
            frequencies.update(searchable)
            # Determine rating for result ranking
            nratings = 0
            if akafor and akafor in ratings:
                nratings = ratings[akafor].nratings
            elif not akafor and data.title in ratings:
                nratings = ratings[data.title].nratings
            # Write movie to output
            # NOTE(review): words are concatenated with no separator
            # (''.join); presumably the query side normalizes the same
            # way -- confirm against the search/index readers.
            indexfh.write("\t".join((''.join(searchable),
                                     data.year.encode('ascii')
                                     if data.year else '',
                                     data.title.encode('utf-8'),
                                     akafor.encode('utf-8'),
                                     str(nratings))))
            indexfh.write("\n")
    indexfh.close()
    #print "Skipped %d duplicate AKA titles" % skipped

    # Write frequencies to stopwords file
    # Disabled: stopword output is kept for reference but never runs.
    if False:
        swf = ChunkedFile(dbfile, 'stopwords', mode='a')
        for word, numtimes in frequencies.most_common():
            swf.write("%s %d\n" % (word, numtimes))
        swf.close()
def _find_seeks_index(dbfile, indexname, queries, debug=False):
    """Yield (start, None, nresults) seek positions for each matching
    record found via the index file. End locations are unnecessary since
    matches are guaranteed to be present, so a count of occurrences is
    enough for prompt termination."""
    timer = Timer(rl_min_dur=1)
    hits = Counter()
    index = ChunkedFile(dbfile, indexname, mode='r')
    prev_mark = 0
    for term in sorted(queries):
        # Bookmarks let us jump close to the term instead of scanning.
        mark = index.find_bookmark(term.encode('utf-8'))
        if mark != prev_mark:
            index.seek(mark)
            prev_mark = mark
        for lineno, raw in enumerate(index):
            name, positions = raw.decode('utf-8').split('\t')
            if lineno % 100 == 0:
                timer.step()
            if name in queries:
                hits.update(int(p) for p in positions.split(' '))
            elif name > term:
                # The index is sorted, so no later entry can match.
                break
    index.close()
    for offset, count in sorted(hits.items()):
        yield (offset, None, count)
Example #3
0
def _find_seeks_index(dbfile, indexname, queries, debug=False):
    """Use the index file to find exact seek positions for relevant
    records. End locations are not necessary since we are guaranteed that
    the data will be present, so a number of occurances is sufficient for
    prompt termination."""
    timer = Timer(rl_min_dur=1)
    locs = Counter()
    if debug:
        print "  Searching index..."
    indexfh = ChunkedFile(dbfile, indexname, mode='r')
    last_bookmark = 0
    for query in sorted(queries):
        # Use bookmarks to rapidly search the index!
        bookmark = indexfh.find_bookmark(query.encode('utf-8'))
        if bookmark != last_bookmark:
            indexfh.seek(bookmark)
            #print "  Seek to", bookmark
            last_bookmark = bookmark
        for i, line in enumerate(indexfh):
            title, nums = line.decode('utf-8').split('\t')
            if i % 100 == 0:
                timer.step()
            if title in queries:
                locs.update(int(x) for x in nums.split(' '))
            elif title > query:
                break   # This works because the index is sorted.
    indexfh.close()
    for start, nresults in sorted(locs.items()):
        yield (start, None, nresults)
    if debug:
        print '  Completed in', timer, 'seconds.'
def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching.

    Reads every title from movies.list.gz and aka-titles.list.gz and
    writes one tab-separated record per title to ``dbfile + '.idx'``:
    searchable name, year, canonical title, AKA-for title (empty for
    primary titles), and the title's number of ratings (for ranking).
    """
    # Load ratings; number of ratings included in index for score weighting
    ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()

    # Count word frequencies while outputting searchable list
    frequencies = Counter()
    #indexfh = ChunkedFile(dbfile, 'index', mode='a')
    indexfh = open_compressed(dbfile + '.idx', mode='w')

    # Index all IMDb titles
    skipped = 0
    for iterator in \
            (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
             parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search()):
        last_time = None
        for obj in iterator:
            if len(obj) == 1:  # movies.list.gz
                data = parsers.parse_title(obj[0])
                akafor = ''
            else:  # aka-titles.list.gz
                data = parsers.parse_title(obj[1])  # AKA name of the title
                akafor = obj[0]  # Real name of the title
                # If it's a duplicate AKA (for indexing purposes), skip it.
                # The same AKA title may be repeated. For example:
                #     (aka Die Hard 4.0 (2007)) (UK)
                #     (aka Die Hard 4.0 (2007)) (Germany)
                if last_time and last_time[0:2] == obj[0:2]:
                    skipped += 1
                    continue
                last_time = obj
            # Normalized words of the title; feeds both the searchable
            # key and the word-frequency statistics.
            searchable = _clean_word(data.name).split(' ')
            # Save word frequencies
            frequencies.update(searchable)
            # Determine rating for result ranking
            nratings = 0
            if akafor and akafor in ratings:
                nratings = ratings[akafor].nratings
            elif not akafor and data.title in ratings:
                nratings = ratings[data.title].nratings
            # Write movie to output
            # NOTE(review): words are concatenated with no separator
            # (''.join); presumably the query side normalizes the same
            # way -- confirm against the search/index readers.
            indexfh.write("\t".join(
                (''.join(searchable),
                 data.year.encode('ascii') if data.year else '',
                 data.title.encode('utf-8'), akafor.encode('utf-8'),
                 str(nratings))))
            indexfh.write("\n")
    indexfh.close()
    #print "Skipped %d duplicate AKA titles" % skipped

    # Write frequencies to stopwords file
    # Disabled: stopword output is kept for reference but never runs.
    if False:
        swf = ChunkedFile(dbfile, 'stopwords', mode='a')
        for word, numtimes in frequencies.most_common():
            swf.write("%s %d\n" % (word, numtimes))
        swf.close()
Example #5
0
    def _run_search(self, queries):
        """Return items from the data file matching any item in queries.

        Generator. ``queries`` may be None (yield every record) or an
        iterable of keys; an empty iterable yields nothing. Each match
        is passed through self._make_result before being yielded.
        """
        if queries is not None:
            queries = set(queries)
            # Don't do anything if an empty set is provided
            if not queries:
                return

        # Open the compressed database, either copied version or original file.
        if self.dbfile:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
        else:
            # Without a combined dbfile only a single original list file
            # can be scanned.
            assert(len(self.origfiles) == 1)
            try:
                fileobj = open_compressed(self.origfiles[0])
            except IOError as e:
                print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
                return
            self._skip_header(fileobj)
        if self.debug:
            print "Reading %s..." % self.listname

        # Locate seek positions for all queries
        if queries and self.indexname:  # Use index
            locs = list(_find_seeks_index(self.dbfile, self.indexname, queries,
                                          debug=self.debug))
        elif queries:                   # Use bookmarks
            locs = list(_find_seeks_bookmarks(fileobj, queries,
                                              debug=self.debug))
        else:
            locs = [(None, None, 1)]     # Dummy values to start loop

        # Read selected lines from the file
        timer = Timer()
        loc = 0
        for startloc, endloc, nresults in locs:
            # Skip to the correct position in the file
            if queries:
                if startloc > loc:
                    #print "  Seek to", startloc
                    fileobj.seek(startloc)
                    loc = fileobj.tell()
                elif startloc < loc:
                    # Already past this start position; a previous
                    # iteration consumed it.
                    #print "  Skipping", startloc, "already at", loc
                    continue
                #else:
                #    print "  Skipping", startloc, "already there"
                #print "    Finish at", endloc, "after", nresults, "results"
            for _ in xrange(nresults):
                # Parse the file until we get a result
                for i, line in enumerate(fileobj):
                    # Determine if we have reached the end location for this
                    # section
                    if endloc and loc == endloc:
                        break
                    #assert(not endloc or loc < endloc)

                    # Do not index video games or individual TV episodes
                    # (Not applicable for all file types)
                    if not self.dbfile and self.skip_tvvg and \
                            ('(VG)' in line or '{' in line):
                        #loc = fileobj.tell() # Don't seek/tell in gzip
                        continue
                    # Decode database (IMDb databases use ISO-8859-1)
                    line = line.rstrip().decode('iso-8859-1')

                    if queries and i % 100 == 0:
                        timer.step()

                    # loc is the offset of the line just parsed; it is
                    # advanced only for ChunkedFile (gzip can't tell()).
                    data = self._parse_line(line, loc)
                    if self.dbfile:
                        loc = fileobj.tell()

                    if data is None:
                        break           # End of database
                    if not data:
                        continue        # Skip this line

                    # Check if one of our queries matches
                    if queries is None or data[0] in queries:
                        yield self._make_result(data)
                        if queries is not None:
                            # queries.remove(data[0])
                            break

        if self.debug:
            print 'Completed in', timer, 'seconds.'
        fileobj.close()
Example #6
0
    def rebuild_index(self, do_copy=True):
        """Create an index for this file, to allow rapid seeking to information
        about a given title.

        With do_copy=True (the only implemented mode) each original list
        file is copied into a ChunkedFile while being indexed; titles are
        accumulated into a separate index chunk (when self.indexname is
        set) or recorded as bookmarks in the copy itself.
        """
        if do_copy:
            copy_to = ChunkedFile(self.dbfile, self.listname, mode='a',
                                  autoflush=True if self.indexname else False)
            tellobj = copy_to
            filenames = self.origfiles
        else:
            #filenames = ???
            copy_to = None
            raise NotImplementedError

        # Maps encoded title -> list of locator lines for the index.
        indexobj = defaultdict(list)

        for filename in filenames:
            if do_copy:
                try:
                    fileobj = open_compressed(filename)
                except IOError as e:
                    print "  Skipping %s: %s" % (filename, e.strerror)
                    continue
            else:
                fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
                tellobj = fileobj

            self._skip_header(fileobj)
            # Get location of this line
            loc = tellobj.tell()
            for line in fileobj:
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                if self.skip_tvvg and ('(VG)' in line or '{' in line):
                    continue
                if copy_to:
                    copy_to.write(line)
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')

                # loc refers to the line just written/read; refresh it
                # AFTER parsing so the next iteration sees its own start.
                data = self._parse_line(line, loc)
                loc = tellobj.tell()
                if data is None:
                    break           # End of database
                if not data:
                    continue        # Skip this line

                # Add to the index
                title, idxline = data[0:2] #self._make_locator(data)
                title = title.encode('utf-8')
                if self.indexname:
                    indexobj[title].append(idxline)
                elif copy_to:
                    copy_to.bookmark(title)
            fileobj.close()
        if copy_to:
            copy_to.close()

        if self.indexname:
            # Write out a separate index, if required (e.g. names databases)
            indexfh = ChunkedFile(self.dbfile, self.indexname, mode='a',
                                  autoflush=False)
            for title, linenos in sorted(indexobj.items()):
                indexfh.write(title)
                indexfh.write("\t")
                indexfh.write(' '.join(str(i) for i in linenos))
                indexfh.write("\n")
                indexfh.bookmark(title)
            indexfh.close()
        else:
            # An index is required to use more than one file, since the
            # resulting combination will not be sorted
            assert(len(filenames) == 1)
    def _run_search(self, queries):
        """Return items from the data file matching any item in queries.

        Generator. ``queries`` may be None (yield every record) or an
        iterable of keys; an empty iterable yields nothing. Each match
        is passed through self._make_result before being yielded.
        """
        if queries is not None:
            queries = set(queries)
            # Don't do anything if an empty set is provided
            if not queries:
                return

        # Open the compressed database, either copied version or original file.
        if self.dbfile:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
        else:
            # Without a combined dbfile only a single original list file
            # can be scanned.
            assert (len(self.origfiles) == 1)
            try:
                fileobj = open_compressed(self.origfiles[0])
            except IOError as e:
                print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
                return
            self._skip_header(fileobj)
        # if self.debug:
        #     print "Reading %s..." % self.listname

        # Locate seek positions for all queries
        if queries and self.indexname:  # Use index
            locs = list(
                _find_seeks_index(self.dbfile,
                                  self.indexname,
                                  queries,
                                  debug=self.debug))
        elif queries:  # Use bookmarks
            locs = list(
                _find_seeks_bookmarks(fileobj, queries, debug=self.debug))
        else:
            locs = [(None, None, 1)]  # Dummy values to start loop

        # Read selected lines from the file
        timer = Timer()
        loc = 0
        for startloc, endloc, nresults in locs:
            # Skip to the correct position in the file
            if queries:
                if startloc > loc:
                    #print "  Seek to", startloc
                    fileobj.seek(startloc)
                    loc = fileobj.tell()
                elif startloc < loc:
                    # Already past this start position; a previous
                    # iteration consumed it.
                    #print "  Skipping", startloc, "already at", loc
                    continue
                #else:
                #    print "  Skipping", startloc, "already there"
                #print "    Finish at", endloc, "after", nresults, "results"
            for _ in xrange(nresults):
                # Parse the file until we get a result
                for i, line in enumerate(fileobj):
                    # Determine if we have reached the end location for this
                    # section
                    if endloc and loc == endloc:
                        break
                    #assert(not endloc or loc < endloc)

                    # Do not index video games or individual TV episodes
                    # (Not applicable for all file types)
                    if not self.dbfile and self.skip_tvvg and \
                            ('(VG)' in line or '{' in line):
                        #loc = fileobj.tell() # Don't seek/tell in gzip
                        continue
                    # Decode database (IMDb databases use ISO-8859-1)
                    line = line.rstrip().decode('iso-8859-1')

                    if queries and i % 100 == 0:
                        timer.step()

                    # loc is the offset of the line just parsed; it is
                    # advanced only for ChunkedFile (gzip can't tell()).
                    data = self._parse_line(line, loc)
                    if self.dbfile:
                        loc = fileobj.tell()

                    if data is None:
                        break  # End of database
                    if not data:
                        continue  # Skip this line

                    # Check if one of our queries matches
                    if queries is None or data[0] in queries:
                        yield self._make_result(data)
                        if queries is not None:
                            # queries.remove(data[0])
                            break

        # if self.debug:
        #     print 'Completed in', timer, 'seconds.'
        fileobj.close()
    def rebuild_index(self, do_copy=True):
        """Create an index for this file, to allow rapid seeking to information
        about a given title.

        With do_copy=True (the only implemented mode) each original list
        file is copied into a ChunkedFile while being indexed; titles are
        accumulated into a separate index chunk (when self.indexname is
        set) or recorded as bookmarks in the copy itself.
        """
        if do_copy:
            copy_to = ChunkedFile(self.dbfile,
                                  self.listname,
                                  mode='a',
                                  autoflush=True if self.indexname else False)
            tellobj = copy_to
            filenames = self.origfiles
        else:
            #filenames = ???
            copy_to = None
            raise NotImplementedError

        # Maps encoded title -> list of locator lines for the index.
        indexobj = defaultdict(list)

        for filename in filenames:
            if do_copy:
                try:
                    fileobj = open_compressed(filename)
                except IOError as e:
                    print "  Skipping %s: %s" % (filename, e.strerror)
                    continue
            else:
                fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
                tellobj = fileobj

            self._skip_header(fileobj)
            # Get location of this line
            loc = tellobj.tell()
            for line in fileobj:
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                if self.skip_tvvg and ('(VG)' in line or '{' in line):
                    continue
                if copy_to:
                    copy_to.write(line)
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')

                # loc refers to the line just written/read; refresh it
                # AFTER parsing so the next iteration sees its own start.
                data = self._parse_line(line, loc)
                loc = tellobj.tell()
                if data is None:
                    break  # End of database
                if not data:
                    continue  # Skip this line

                # Add to the index
                title, idxline = data[0:2]  #self._make_locator(data)
                title = title.encode('utf-8')
                if self.indexname:
                    indexobj[title].append(idxline)
                elif copy_to:
                    copy_to.bookmark(title)
            fileobj.close()
        if copy_to:
            copy_to.close()

        if self.indexname:
            # Write out a separate index, if required (e.g. names databases)
            indexfh = ChunkedFile(self.dbfile,
                                  self.indexname,
                                  mode='a',
                                  autoflush=False)
            for title, linenos in sorted(indexobj.items()):
                indexfh.write(title)
                indexfh.write("\t")
                indexfh.write(' '.join(str(i) for i in linenos))
                indexfh.write("\n")
                indexfh.bookmark(title)
            indexfh.close()
        else:
            # An index is required to use more than one file, since the
            # resulting combination will not be sorted
            assert (len(filenames) == 1)