def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching."""
    # Load ratings; number of ratings included in index for score weighting
    ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()
    # Count word frequencies while outputting searchable list
    frequencies = Counter()
    #indexfh = ChunkedFile(dbfile, 'index', mode='a')
    indexfh = open_compressed(dbfile + '.idx', mode='w')
    # Index all IMDb titles
    skipped = 0
    for iterator in \
            (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
             parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search()):
        last_time = None
        for obj in iterator:
            if len(obj) == 1:       # movies.list.gz
                data = parsers.parse_title(obj[0])
                akafor = ''
            else:                   # aka-titles.list.gz
                data = parsers.parse_title(obj[1])  # AKA name of the title
                akafor = obj[0]                     # Real name of the title
            # If it's a duplicate AKA (for indexing purposes), skip it.
            # The same AKA title may be repeated.  For example:
            #   (aka Die Hard 4.0 (2007))  (UK)
            #   (aka Die Hard 4.0 (2007))  (Germany)
            if last_time and last_time[0:2] == obj[0:2]:
                skipped += 1
                continue
            last_time = obj
            searchable = _clean_word(data.name).split(' ')
            # Save word frequencies
            frequencies.update(searchable)
            # Determine rating for result ranking
            nratings = 0
            if akafor and akafor in ratings:
                nratings = ratings[akafor].nratings
            elif not akafor and data.title in ratings:
                nratings = ratings[data.title].nratings
            # Write movie to output
            indexfh.write("\t".join((''.join(searchable),
                                     data.year.encode('ascii')
                                     if data.year else '',
                                     data.title.encode('utf-8'),
                                     akafor.encode('utf-8'),
                                     str(nratings))))
            indexfh.write("\n")
    indexfh.close()
    #print "Skipped %d duplicate AKA titles" % skipped
    # Write frequencies to stopwords file
    if False:
        swf = ChunkedFile(dbfile, 'stopwords', mode='a')
        for word, numtimes in frequencies.most_common():
            swf.write("%s %d\n" % (word, numtimes))
        swf.close()
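
# Illustrative sketch, not part of the original module: create_index writes
# one flat, tab-separated record per title of the form
#   SEARCHABLE \t YEAR \t TITLE \t AKAFOR \t NRATINGS
# The helper below shows how such a record could be split back into fields;
# the function name and the returned dict keys are assumptions made for this
# example only.
def _example_parse_index_record(line):
    """Split one index record into its five fields (example only)."""
    searchable, year, title, akafor, nratings = line.rstrip("\n").split("\t")
    return {'searchable': searchable,
            'year': int(year) if year else None,
            'title': title,
            'akafor': akafor,           # empty for a primary (non-AKA) title
            'nratings': int(nratings)}  # used for result ranking
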
def _search_index(timer, dbfile, words, size, strip_stems=True,
                  year=None, deltayear=8, debug=False):
    """Yield a subset of the database that somewhat matches words.

    Returns any movie that contains a subword of any of words.  (See
    the _subwords function.)  Shorter subwords mean more results, but
    slower performance.

    words -- List of words.
    size -- Length of subwords to use for search.  (See _subwords function.)
    strip_stems -- Omit really common subwords.  (See _subwords function.)
    year -- A guess of the year.  Only returns movies dated near year.
    deltayear -- Only return movies with year in [year-deltayear, year+deltayear].
    """
    # Extract a plausible-looking subset of the database so we don't
    # have to run SequenceMatcher on everything.  This works pretty
    # well, except for movies like O (2001).

    # A list of plain-text strings that we expect to find in the
    # SEARCHABLE field of the data.  We will require at least one of
    # these to be present.
    wordlist = tuple(_subwords(_clean_words(words, strip_stems), size))
    # If we are provided with an estimated year, compose a list of
    # acceptable years.
    validyears = range(year - deltayear, year + deltayear) if year else ()
    if debug:
        print wordlist
        print "Searching..."
    # Reading lines out of a GzipFile is very slow; using gzip(1) is ~6.5x
    # faster.  For further speedup, we could use zgrep(1) to extract our
    # subset using grep(1).
    #indexfh = ChunkedFile(dbfile, 'index')
    indexfh = open_compressed(dbfile + '.idx')
    #indexfh = open('idx.tmp')
    for i, line in enumerate(indexfh):
        # Quick check to determine if the entry matches any of our words
        # (grep -F is faster; grep -E might be faster still)
        for word in wordlist:
            if word in line:
                break
        else:
            continue
        # Get SEARCHABLE\tYEAR\tTITLE\tAKAFOR\tNRATINGS
        ryear, title, akafor, nratings = line.decode('utf-8').split('\t')[1:]
        # Check that the year is within tolerances
        if validyears and ryear and int(ryear) not in validyears:
            continue
        yield title, ryear, akafor, nratings
        if i % 100 == 0:
            timer.step()
    indexfh.close()
    if debug:
        print 'Completed search in', timer, 'seconds.'
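
# Illustrative sketch, not part of the original module: the quick check in
# _search_index keeps a line only if at least one subword occurs in it,
# using a for/else loop as a cheap substring prefilter before any expensive
# matching.  The standalone generator below demonstrates the same idiom;
# its name is an assumption for this example.
def _example_prefilter(lines, wordlist):
    """Yield only the lines containing at least one word from wordlist."""
    for line in lines:
        for word in wordlist:
            if word in line:
                break       # a subword matched; keep this line
        else:
            continue        # no subword matched; skip this line
        yield line
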
def _run_search(self, queries):
    """Return items from the data file matching any item in queries."""
    if queries is not None:
        queries = set(queries)
        # Don't do anything if an empty set is provided
        if not queries:
            return
    # Open the compressed database, either copied version or original file.
    if self.dbfile:
        fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
    else:
        assert(len(self.origfiles) == 1)
        try:
            fileobj = open_compressed(self.origfiles[0])
        except IOError as e:
            print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
            return
        self._skip_header(fileobj)
    if self.debug:
        print "Reading %s..." % self.listname
    # Locate seek positions for all queries
    if queries and self.indexname:
        # Use index
        locs = list(_find_seeks_index(self.dbfile, self.indexname,
                                      queries, debug=self.debug))
    elif queries:
        # Use bookmarks
        locs = list(_find_seeks_bookmarks(fileobj, queries,
                                          debug=self.debug))
    else:
        locs = [(None, None, 1)]    # Dummy values to start loop

    # Read selected lines from the file
    timer = Timer()
    loc = 0
    for startloc, endloc, nresults in locs:
        # Skip to the correct position in the file
        if queries:
            if startloc > loc:
                #print " Seek to", startloc
                fileobj.seek(startloc)
                loc = fileobj.tell()
            elif startloc < loc:
                #print " Skipping", startloc, "already at", loc
                continue
            #else:
            #    print " Skipping", startloc, "already there"
            #print " Finish at", endloc, "after", nresults, "results"
        for _ in xrange(nresults):
            # Parse the file until we get a result
            for i, line in enumerate(fileobj):
                # Determine if we have reached the end location for this
                # section
                if endloc and loc == endloc:
                    break
                #assert(not endloc or loc < endloc)
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                if not self.dbfile and self.skip_tvvg and \
                        ('(VG)' in line or '{' in line):
                    #loc = fileobj.tell()  # Don't seek/tell in gzip
                    continue
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')
                if queries and i % 100 == 0:
                    timer.step()
                data = self._parse_line(line, loc)
                if self.dbfile:
                    loc = fileobj.tell()
                if data is None:
                    break           # End of database
                if not data:
                    continue        # Skip this line
                # Check if one of our queries matches
                if queries is None or data[0] in queries:
                    yield self._make_result(data)
                    if queries is not None:
                        # queries.remove(data[0])
                        break
    if self.debug:
        print 'Completed in', timer, 'seconds.'
    fileobj.close()
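
# Illustrative sketch, not part of the original module: _run_search consumes
# (startloc, endloc, nresults) tuples and only ever seeks forward through the
# file.  The simplified loop below demonstrates that pattern on an ordinary
# file object with integer offsets; the function name and the exact endloc
# handling are assumptions for this example.
def _example_seek_scan(fileobj, locs):
    """Yield up to nresults lines for each (startloc, endloc, nresults)."""
    loc = 0
    for startloc, endloc, nresults in locs:
        if startloc > loc:
            fileobj.seek(startloc)      # jump forward to the next section
            loc = fileobj.tell()
        elif startloc < loc:
            continue                    # already past this section; skip it
        for _ in range(nresults):
            line = fileobj.readline()
            loc = fileobj.tell()
            if not line or (endloc and loc > endloc):
                break                   # end of file or end of this section
            yield line
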
def rebuild_index(self, do_copy=True):
    """Create an index for this file, to allow rapid seeking to
    information about a given title."""
    if do_copy:
        copy_to = ChunkedFile(self.dbfile, self.listname, mode='a',
                              autoflush=True if self.indexname else False)
        tellobj = copy_to
        filenames = self.origfiles
    else:
        #filenames = ???
        copy_to = None
        raise NotImplementedError
    indexobj = defaultdict(list)
    for filename in filenames:
        if do_copy:
            try:
                fileobj = open_compressed(filename)
            except IOError as e:
                print " Skipping %s: %s" % (filename, e.strerror)
                continue
        else:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
            tellobj = fileobj
        self._skip_header(fileobj)
        # Get location of this line
        loc = tellobj.tell()
        for line in fileobj:
            # Do not index video games or individual TV episodes
            # (Not applicable for all file types)
            if self.skip_tvvg and ('(VG)' in line or '{' in line):
                continue
            if copy_to:
                copy_to.write(line)
            # Decode database (IMDb databases use ISO-8859-1)
            line = line.rstrip().decode('iso-8859-1')
            data = self._parse_line(line, loc)
            loc = tellobj.tell()
            if data is None:
                break               # End of database
            if not data:
                continue            # Skip this line
            # Add to the index
            title, idxline = data[0:2]  #self._make_locator(data)
            title = title.encode('utf-8')
            if self.indexname:
                indexobj[title].append(idxline)
            elif copy_to:
                copy_to.bookmark(title)
        fileobj.close()
    if copy_to:
        copy_to.close()
    if self.indexname:
        # Write out a separate index, if required (e.g. names databases)
        indexfh = ChunkedFile(self.dbfile, self.indexname, mode='a',
                              autoflush=False)
        for title, linenos in sorted(indexobj.items()):
            indexfh.write(title)
            indexfh.write("\t")
            indexfh.write(' '.join(str(i) for i in linenos))
            indexfh.write("\n")
            indexfh.bookmark(title)
        indexfh.close()
    else:
        # An index is required to use more than one file, since the
        # resulting combination will not be sorted
        assert(len(filenames) == 1)
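
# Illustrative sketch, not part of the original module: when indexname is
# set, rebuild_index emits one entry per title of the form
#   TITLE \t LOC [LOC ...]
# sorted by title, where each LOC is whatever _parse_line produced (assumed
# here to be an integer location).  The reader below shows how such an entry
# could be decoded; the function name is an assumption for this example.
def _example_parse_index_entry(line):
    """Return (title, locations) for one entry of the separate index."""
    title, locs = line.rstrip("\n").split("\t", 1)
    return title, [int(n) for n in locs.split(' ')]
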