예제 #1
0
def _find_seeks_index(dbfile, indexname, queries, debug=False):
    """Use the index file to find exact seek positions for relevant
    records. End locations are not necessary since we are guaranteed that
    the data will be present, so a number of occurances is sufficient for
    prompt termination."""
    timer = Timer(rl_min_dur=1)
    locs = Counter()
    if debug:
        print "  Searching index..."
    indexfh = ChunkedFile(dbfile, indexname, mode='r')
    last_bookmark = 0
    for query in sorted(queries):
        # Use bookmarks to rapidly search the index!
        bookmark = indexfh.find_bookmark(query.encode('utf-8'))
        if bookmark != last_bookmark:
            indexfh.seek(bookmark)
            #print "  Seek to", bookmark
            last_bookmark = bookmark
        for i, line in enumerate(indexfh):
            title, nums = line.decode('utf-8').split('\t')
            if i % 100 == 0:
                timer.step()
            if title in queries:
                locs.update(int(x) for x in nums.split(' '))
            elif title > query:
                break   # This works because the index is sorted.
    indexfh.close()
    for start, nresults in sorted(locs.items()):
        yield (start, None, nresults)
    if debug:
        print '  Completed in', timer, 'seconds.'
예제 #2
0
def _find_seeks_index(dbfile, indexname, queries, debug=False):
    """Use the index file to find exact seek positions for relevant records.

    End locations are not necessary since we are guaranteed that the data
    will be present, so a number of occurrences is sufficient for prompt
    termination.

    Each index line is read as UTF-8 text of the form
    ``title<TAB>pos pos ...`` where every ``pos`` is an integer.

    Arguments:
        dbfile    -- passed with indexname to ChunkedFile to open the index.
        indexname -- name of the index to open within dbfile.
        queries   -- collection of titles to look up. It is iterated in
                     sorted order and also used for membership tests.
        debug     -- accepted for interface compatibility; this function
                     does not produce any debug output.

    Yields:
        (start, None, nresults) tuples in ascending order of start, where
        nresults is the number of times that start position was listed on
        the index lines of matching titles.
    """
    timer = Timer(rl_min_dur=1)
    locs = Counter()
    indexfh = ChunkedFile(dbfile, indexname, mode='r')
    # Close the index even if a line fails to parse or a seek fails;
    # the whole scan finishes before the first value is yielded.
    try:
        last_bookmark = 0
        for query in sorted(queries):
            # Use bookmarks to rapidly search the index!
            bookmark = indexfh.find_bookmark(query.encode('utf-8'))
            # Only seek when the bookmark changes; otherwise keep reading
            # from wherever the previous query's scan stopped.
            if bookmark != last_bookmark:
                indexfh.seek(bookmark)
                last_bookmark = bookmark
            for i, line in enumerate(indexfh):
                title, nums = line.decode('utf-8').split('\t')
                # NOTE(review): presumably drives rate-limited progress
                # reporting (rl_min_dur) -- confirm against Timer.
                if i % 100 == 0:
                    timer.step()
                if title in queries:
                    # Any queried title is recorded, not only the current
                    # query. int() tolerates the trailing newline on the
                    # last number.
                    locs.update(int(x) for x in nums.split(' '))
                elif title > query:
                    break  # This works because the index is sorted.
    finally:
        indexfh.close()
    for start, nresults in sorted(locs.items()):
        yield (start, None, nresults)
예제 #3
0
    def _run_search(self, queries):
        """Return items from the data file matching any item in queries."""
        if queries is not None:
            queries = set(queries)
            # Don't do anything if an empty set is provided
            if not queries:
                return

        # Open the compressed database, either copied version or original file.
        if self.dbfile:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
        else:
            assert(len(self.origfiles) == 1)
            try:
                fileobj = open_compressed(self.origfiles[0])
            except IOError as e:
                print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
                return
            self._skip_header(fileobj)
        if self.debug:
            print "Reading %s..." % self.listname

        # Locate seek positions for all queries
        if queries and self.indexname:  # Use index
            locs = list(_find_seeks_index(self.dbfile, self.indexname, queries,
                                          debug=self.debug))
        elif queries:                   # Use bookmarks
            locs = list(_find_seeks_bookmarks(fileobj, queries,
                                              debug=self.debug))
        else:
            locs = [(None, None, 1)]     # Dummy values to start loop

        # Read selected lines from the file
        timer = Timer()
        loc = 0
        for startloc, endloc, nresults in locs:
            # Skip to the correct position in the file
            if queries:
                if startloc > loc:
                    #print "  Seek to", startloc
                    fileobj.seek(startloc)
                    loc = fileobj.tell()
                elif startloc < loc:
                    #print "  Skipping", startloc, "already at", loc
                    continue
                #else:
                #    print "  Skipping", startloc, "already there"
                #print "    Finish at", endloc, "after", nresults, "results"
            for _ in xrange(nresults):
                # Parse the file until we get a result
                for i, line in enumerate(fileobj):
                    # Determine if we have reached the end location for this
                    # section
                    if endloc and loc == endloc:
                        break
                    #assert(not endloc or loc < endloc)

                    # Do not index video games or individual TV episodes
                    # (Not applicable for all file types)
                    if not self.dbfile and self.skip_tvvg and \
                            ('(VG)' in line or '{' in line):
                        #loc = fileobj.tell() # Don't seek/tell in gzip
                        continue
                    # Decode database (IMDb databases use ISO-8859-1)
                    line = line.rstrip().decode('iso-8859-1')

                    if queries and i % 100 == 0:
                        timer.step()

                    data = self._parse_line(line, loc)
                    if self.dbfile:
                        loc = fileobj.tell()

                    if data is None:
                        break           # End of database
                    if not data:
                        continue        # Skip this line

                    # Check if one of our queries matches
                    if queries is None or data[0] in queries:
                        yield self._make_result(data)
                        if queries is not None:
                            # queries.remove(data[0])
                            break

        if self.debug:
            print 'Completed in', timer, 'seconds.'
        fileobj.close()
예제 #4
0
    def _run_search(self, queries):
        """Return items from the data file matching any item in queries."""
        if queries is not None:
            queries = set(queries)
            # Don't do anything if an empty set is provided
            if not queries:
                return

        # Open the compressed database, either copied version or original file.
        if self.dbfile:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
        else:
            assert (len(self.origfiles) == 1)
            try:
                fileobj = open_compressed(self.origfiles[0])
            except IOError as e:
                print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
                return
            self._skip_header(fileobj)
        # if self.debug:
        #     print "Reading %s..." % self.listname

        # Locate seek positions for all queries
        if queries and self.indexname:  # Use index
            locs = list(
                _find_seeks_index(self.dbfile,
                                  self.indexname,
                                  queries,
                                  debug=self.debug))
        elif queries:  # Use bookmarks
            locs = list(
                _find_seeks_bookmarks(fileobj, queries, debug=self.debug))
        else:
            locs = [(None, None, 1)]  # Dummy values to start loop

        # Read selected lines from the file
        timer = Timer()
        loc = 0
        for startloc, endloc, nresults in locs:
            # Skip to the correct position in the file
            if queries:
                if startloc > loc:
                    #print "  Seek to", startloc
                    fileobj.seek(startloc)
                    loc = fileobj.tell()
                elif startloc < loc:
                    #print "  Skipping", startloc, "already at", loc
                    continue
                #else:
                #    print "  Skipping", startloc, "already there"
                #print "    Finish at", endloc, "after", nresults, "results"
            for _ in xrange(nresults):
                # Parse the file until we get a result
                for i, line in enumerate(fileobj):
                    # Determine if we have reached the end location for this
                    # section
                    if endloc and loc == endloc:
                        break
                    #assert(not endloc or loc < endloc)

                    # Do not index video games or individual TV episodes
                    # (Not applicable for all file types)
                    if not self.dbfile and self.skip_tvvg and \
                            ('(VG)' in line or '{' in line):
                        #loc = fileobj.tell() # Don't seek/tell in gzip
                        continue
                    # Decode database (IMDb databases use ISO-8859-1)
                    line = line.rstrip().decode('iso-8859-1')

                    if queries and i % 100 == 0:
                        timer.step()

                    data = self._parse_line(line, loc)
                    if self.dbfile:
                        loc = fileobj.tell()

                    if data is None:
                        break  # End of database
                    if not data:
                        continue  # Skip this line

                    # Check if one of our queries matches
                    if queries is None or data[0] in queries:
                        yield self._make_result(data)
                        if queries is not None:
                            # queries.remove(data[0])
                            break

        # if self.debug:
        #     print 'Completed in', timer, 'seconds.'
        fileobj.close()