def _find_seeks_index(dbfile, indexname, queries, debug=False):
    """Use the index file to find exact seek positions for relevant records.

    Yields ``(start, None, nresults)`` tuples in ascending seek order.  End
    locations are not necessary since we are guaranteed that the data will
    be present, so a number of occurrences is sufficient for prompt
    termination.
    """
    timer = Timer(rl_min_dur=1)
    # Maps data-file seek offset -> how many matching records start there.
    locs = Counter()
    if debug:
        print " Searching index..."
    indexfh = ChunkedFile(dbfile, indexname, mode='r')
    last_bookmark = 0
    for query in sorted(queries):
        # Use bookmarks to rapidly search the index!
        bookmark = indexfh.find_bookmark(query.encode('utf-8'))
        if bookmark != last_bookmark:
            indexfh.seek(bookmark)
            #print " Seek to", bookmark
            last_bookmark = bookmark
        # Scan forward from the bookmark.  Each index line is
        # "title<TAB>offset offset ..." encoded as UTF-8.
        for i, line in enumerate(indexfh):
            title, nums = line.decode('utf-8').split('\t')
            if i % 100 == 0:
                timer.step()  # rate-limited progress accounting
            if title in queries:
                # Count every data-file offset listed for this title.
                locs.update(int(x) for x in nums.split(' '))
            elif title > query:
                break  # This works because the index is sorted.
    indexfh.close()
    # Emit offsets sorted so the caller can read the data file sequentially.
    for start, nresults in sorted(locs.items()):
        yield (start, None, nresults)
    if debug:
        print ' Completed in', timer, 'seconds.'
def _find_seeks_index(dbfile, indexname, queries, debug=False):
    """Use the index file to find exact seek positions for relevant records.

    Yields ``(start, None, nresults)`` tuples in ascending seek order.  End
    locations are not necessary since we are guaranteed that the data will
    be present, so a number of occurrences is sufficient for prompt
    termination.
    """
    timer = Timer(rl_min_dur=1)
    # Maps data-file seek offset -> how many matching records start there.
    locs = Counter()
    # if debug:
    #     # print " Searching index..."
    #     print ""
    indexfh = ChunkedFile(dbfile, indexname, mode='r')
    last_bookmark = 0
    for query in sorted(queries):
        # Use bookmarks to rapidly search the index!
        bookmark = indexfh.find_bookmark(query.encode('utf-8'))
        if bookmark != last_bookmark:
            indexfh.seek(bookmark)
            #print " Seek to", bookmark
            last_bookmark = bookmark
        # Scan forward from the bookmark.  Each index line is
        # "title<TAB>offset offset ..." encoded as UTF-8.
        for i, line in enumerate(indexfh):
            title, nums = line.decode('utf-8').split('\t')
            if i % 100 == 0:
                timer.step()  # rate-limited progress accounting
            if title in queries:
                # Count every data-file offset listed for this title.
                locs.update(int(x) for x in nums.split(' '))
            elif title > query:
                break  # This works because the index is sorted.
    indexfh.close()
    # Emit offsets sorted so the caller can read the data file sequentially.
    for start, nresults in sorted(locs.items()):
        yield (start, None, nresults)
def _run_search(self, queries):
    """Return items from the data file matching any item in queries.

    Generator yielding one result (via ``self._make_result``) per matching
    record.  If *queries* is None, every record in the file is yielded; an
    empty iterable yields nothing at all.
    """
    if queries is not None:
        queries = set(queries)
        # Don't do anything if an empty set is provided
        if not queries:
            return
    # Open the compressed database, either copied version or original file.
    if self.dbfile:
        fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
    else:
        assert(len(self.origfiles) == 1)
        try:
            fileobj = open_compressed(self.origfiles[0])
        except IOError as e:
            print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
            return
        # NOTE(review): header skipping assumed to apply only to the
        # original compressed file, not the ChunkedFile copy -- confirm.
        self._skip_header(fileobj)
    if self.debug:
        print "Reading %s..." % self.listname
    # Locate seek positions for all queries
    if queries and self.indexname:
        # Use index
        locs = list(_find_seeks_index(self.dbfile, self.indexname,
                                      queries, debug=self.debug))
    elif queries:
        # Use bookmarks
        locs = list(_find_seeks_bookmarks(fileobj, queries,
                                          debug=self.debug))
    else:
        locs = [(None, None, 1)]  # Dummy values to start loop
    # Read selected lines from the file
    timer = Timer()
    loc = 0
    for startloc, endloc, nresults in locs:
        # Skip to the correct position in the file
        if queries:
            if startloc > loc:
                #print " Seek to", startloc
                fileobj.seek(startloc)
                loc = fileobj.tell()
            elif startloc < loc:
                # Already read past this offset on a previous iteration.
                #print " Skipping", startloc, "already at", loc
                continue
            #else:
            #    print " Skipping", startloc, "already there"
            #print " Finish at", endloc, "after", nresults, "results"
        for _ in xrange(nresults):
            # Parse the file until we get a result
            for i, line in enumerate(fileobj):
                # Determine if we have reached the end location for this
                # section
                if endloc and loc == endloc:
                    break
                #assert(not endloc or loc < endloc)
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                if not self.dbfile and self.skip_tvvg and \
                        ('(VG)' in line or '{' in line):
                    #loc = fileobj.tell()  # Don't seek/tell in gzip
                    continue
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')
                if queries and i % 100 == 0:
                    timer.step()  # rate-limited progress accounting
                data = self._parse_line(line, loc)
                # Track our position; only the ChunkedFile copy supports
                # tell() (gzip originals do not).
                if self.dbfile:
                    loc = fileobj.tell()
                if data is None:
                    break  # End of database
                if not data:
                    continue  # Skip this line
                # Check if one of our queries matches
                if queries is None or data[0] in queries:
                    yield self._make_result(data)
                    if queries is not None:
                        # queries.remove(data[0])
                        break
    if self.debug:
        print 'Completed in', timer, 'seconds.'
    fileobj.close()
def _run_search(self, queries):
    """Return items from the data file matching any item in queries.

    Generator yielding one result (via ``self._make_result``) per matching
    record.  If *queries* is None, every record in the file is yielded; an
    empty iterable yields nothing at all.
    """
    if queries is not None:
        queries = set(queries)
        # Don't do anything if an empty set is provided
        if not queries:
            return
    # Open the compressed database, either copied version or original file.
    if self.dbfile:
        fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
    else:
        assert (len(self.origfiles) == 1)
        try:
            fileobj = open_compressed(self.origfiles[0])
        except IOError as e:
            print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
            return
        # NOTE(review): header skipping assumed to apply only to the
        # original compressed file, not the ChunkedFile copy -- confirm.
        self._skip_header(fileobj)
    # if self.debug:
    #     print "Reading %s..." % self.listname
    # Locate seek positions for all queries
    if queries and self.indexname:
        # Use index
        locs = list(
            _find_seeks_index(self.dbfile, self.indexname, queries,
                              debug=self.debug))
    elif queries:
        # Use bookmarks
        locs = list(
            _find_seeks_bookmarks(fileobj, queries, debug=self.debug))
    else:
        locs = [(None, None, 1)]  # Dummy values to start loop
    # Read selected lines from the file
    timer = Timer()
    loc = 0
    for startloc, endloc, nresults in locs:
        # Skip to the correct position in the file
        if queries:
            if startloc > loc:
                #print " Seek to", startloc
                fileobj.seek(startloc)
                loc = fileobj.tell()
            elif startloc < loc:
                # Already read past this offset on a previous iteration.
                #print " Skipping", startloc, "already at", loc
                continue
            #else:
            #    print " Skipping", startloc, "already there"
            #print " Finish at", endloc, "after", nresults, "results"
        for _ in xrange(nresults):
            # Parse the file until we get a result
            for i, line in enumerate(fileobj):
                # Determine if we have reached the end location for this
                # section
                if endloc and loc == endloc:
                    break
                #assert(not endloc or loc < endloc)
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                if not self.dbfile and self.skip_tvvg and \
                        ('(VG)' in line or '{' in line):
                    #loc = fileobj.tell()  # Don't seek/tell in gzip
                    continue
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')
                if queries and i % 100 == 0:
                    timer.step()  # rate-limited progress accounting
                data = self._parse_line(line, loc)
                # Track our position; only the ChunkedFile copy supports
                # tell() (gzip originals do not).
                if self.dbfile:
                    loc = fileobj.tell()
                if data is None:
                    break  # End of database
                if not data:
                    continue  # Skip this line
                # Check if one of our queries matches
                if queries is None or data[0] in queries:
                    yield self._make_result(data)
                    if queries is not None:
                        # queries.remove(data[0])
                        break
    # if self.debug:
    #     print 'Completed in', timer, 'seconds.'
    fileobj.close()