def create_index(dbfile, dbdir, debug=False): """Index the movie list for searching.""" # Load ratings; number of ratings included in index for score weighting ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search() # Count word frequencies while outputting searchable list frequencies = Counter() #indexfh = ChunkedFile(dbfile, 'index', mode='a') indexfh = open_compressed(dbfile+'.idx', mode='w') # Index all IMDb titles skipped = 0 for iterator in \ (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(), parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search()): last_time = None for obj in iterator: if len(obj) == 1: # movies.list.gz data = parsers.parse_title(obj[0]) akafor = '' else: # aka-titles.list.gz data = parsers.parse_title(obj[1]) # AKA name of the title akafor = obj[0] # Real name of the title # If it's a duplicate AKA (for indexing purposes), skip it. # The same AKA title may be repeated. For example: # (aka Die Hard 4.0 (2007)) (UK) # (aka Die Hard 4.0 (2007)) (Germany) if last_time and last_time[0:2] == obj[0:2]: skipped += 1 continue last_time = obj searchable = _clean_word(data.name).split(' ') # Save word frequencies frequencies.update(searchable) # Determine rating for result ranking nratings = 0 if akafor and akafor in ratings: nratings = ratings[akafor].nratings elif not akafor and data.title in ratings: nratings = ratings[data.title].nratings # Write movie to output indexfh.write("\t".join((''.join(searchable), data.year.encode('ascii') if data.year else '', data.title.encode('utf-8'), akafor.encode('utf-8'), str(nratings)))) indexfh.write("\n") indexfh.close() #print "Skipped %d duplicate AKA titles" % skipped # Write frequencies to stopwords file if False: swf = ChunkedFile(dbfile, 'stopwords', mode='a') for word, numtimes in frequencies.most_common(): swf.write("%s %d\n" % (word, numtimes)) swf.close()
def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching.

    Emits one tab-separated record per title into <dbfile>.idx:
    searchable name, year, canonical title, AKA target, vote count.
    Vote counts come from the ratings list so results can be weighted.
    """
    # Ratings are folded into each index row for score weighting.
    ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()
    # Word frequencies are tallied as a side product of indexing.
    word_counts = Counter()
    indexfh = open_compressed(dbfile + '.idx', mode='w')
    n_dupes = 0
    sources = (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
               parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search())
    for source in sources:
        prev = None
        for record in source:
            if len(record) == 1:
                # movies.list.gz: one-tuple with the canonical title.
                data = parsers.parse_title(record[0])
                aka = ''
            else:
                # aka-titles.list.gz: AKA name plus its real title.
                data = parsers.parse_title(record[1])
                aka = record[0]
            # The same AKA may recur once per country, e.g.
            #   (aka Die Hard 4.0 (2007)) (UK)
            #   (aka Die Hard 4.0 (2007)) (Germany)
            # so consecutive repeats are indexed only once.
            if prev is not None and prev[0:2] == record[0:2]:
                n_dupes += 1
                continue
            prev = record
            words = _clean_word(data.name).split(' ')
            word_counts.update(words)
            # Rank by the vote count of the real title: an AKA entry uses
            # its target's count, a canonical entry its own.
            lookup = aka or data.title
            nvotes = ratings[lookup].nratings if lookup in ratings else 0
            row = (''.join(words),
                   data.year.encode('ascii') if data.year else '',
                   data.title.encode('utf-8'),
                   aka.encode('utf-8'),
                   str(nvotes))
            indexfh.write("\t".join(row))
            indexfh.write("\n")
    indexfh.close()
    # Stopword output is intentionally disabled in this build.
    if False:
        swf = ChunkedFile(dbfile, 'stopwords', mode='a')
        for word, numtimes in word_counts.most_common():
            swf.write("%s %d\n" % (word, numtimes))
        swf.close()
def rebuild_index(self, do_copy=True):
    """Create an index for this file, to allow rapid seeking to
    information about a given title.

    With do_copy=True (the only implemented mode), the original list
    files are read, copied into a ChunkedFile inside self.dbfile, and
    indexed as they stream through.  Depending on self.indexname, the
    index is either written as a separate chunk (title -> line numbers)
    or recorded as bookmarks on the copied data itself.

    do_copy -- must be True; do_copy=False raises NotImplementedError
    """
    if do_copy:
        copy_to = ChunkedFile(self.dbfile, self.listname, mode='a',
                              autoflush=True if self.indexname else False)
        tellobj = copy_to
        filenames = self.origfiles
    else:
        # Re-indexing in place (without copying) is not implemented.
        #filenames = ???
        copy_to = None
        raise NotImplementedError
    # title (utf-8 bytes) -> list of index lines for that title
    indexobj = defaultdict(list)
    for filename in filenames:
        if do_copy:
            try:
                fileobj = open_compressed(filename)
            except IOError as e:
                # Best-effort: a missing/unreadable source file is skipped.
                print " Skipping %s: %s" % (filename, e.strerror)
                continue
        else:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
            tellobj = fileobj
        self._skip_header(fileobj)
        # Get location of this line.  NOTE: positions are taken from
        # tellobj (the copy destination when copying), so the recorded
        # offsets refer to the copied data, not the source file.
        loc = tellobj.tell()
        for line in fileobj:
            # Do not index video games or individual TV episodes
            # (Not applicable for all file types)
            if self.skip_tvvg and ('(VG)' in line or '{' in line):
                continue
            if copy_to:
                copy_to.write(line)
            # Decode database (IMDb databases use ISO-8859-1)
            line = line.rstrip().decode('iso-8859-1')
            data = self._parse_line(line, loc)
            # Capture the offset *after* the write so it points at the
            # next line before the following iteration parses it.
            loc = tellobj.tell()
            if data is None:
                break  # End of database
            if not data:
                continue  # Skip this line
            # Add to the index
            title, idxline = data[0:2]  #self._make_locator(data)
            title = title.encode('utf-8')
            if self.indexname:
                indexobj[title].append(idxline)
            elif copy_to:
                # No separate index: bookmark the title in the copy.
                copy_to.bookmark(title)
        fileobj.close()
    if copy_to:
        copy_to.close()
    if self.indexname:
        # Write out a separate index, if required (e.g. names databases)
        indexfh = ChunkedFile(self.dbfile, self.indexname, mode='a',
                              autoflush=False)
        for title, linenos in sorted(indexobj.items()):
            indexfh.write(title)
            indexfh.write("\t")
            indexfh.write(' '.join(str(i) for i in linenos))
            indexfh.write("\n")
            indexfh.bookmark(title)
        indexfh.close()
    else:
        # An index is required to use more than one file, since the
        # resulting combination will not be sorted
        assert(len(filenames) == 1)
def rebuild_index(self, do_copy=True):
    """Create an index for this file, to allow rapid seeking to
    information about a given title.

    Streams the original list files into a ChunkedFile within
    self.dbfile (do_copy=True is the only implemented mode) and builds
    either a separate index chunk (when self.indexname is set) mapping
    each title to its line numbers, or bookmarks directly on the copied
    data otherwise.

    do_copy -- must be True; do_copy=False raises NotImplementedError
    """
    if do_copy:
        copy_to = ChunkedFile(self.dbfile, self.listname, mode='a',
                              autoflush=True if self.indexname else False)
        tellobj = copy_to
        filenames = self.origfiles
    else:
        # In-place re-index (no copy) was never finished.
        #filenames = ???
        copy_to = None
        raise NotImplementedError
    # Maps utf-8 title -> list of index lines (only used with indexname).
    indexobj = defaultdict(list)
    for filename in filenames:
        if do_copy:
            try:
                fileobj = open_compressed(filename)
            except IOError as e:
                # Unreadable source files are skipped, not fatal.
                print " Skipping %s: %s" % (filename, e.strerror)
                continue
        else:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
            tellobj = fileobj
        self._skip_header(fileobj)
        # Get location of this line — taken from tellobj, so when copying
        # the offsets refer to positions in the copy destination.
        loc = tellobj.tell()
        for line in fileobj:
            # Do not index video games or individual TV episodes
            # (Not applicable for all file types)
            if self.skip_tvvg and ('(VG)' in line or '{' in line):
                continue
            if copy_to:
                copy_to.write(line)
            # Decode database (IMDb databases use ISO-8859-1)
            line = line.rstrip().decode('iso-8859-1')
            data = self._parse_line(line, loc)
            # Refresh loc only after the current line was written/parsed,
            # so the next iteration sees the offset of its own line.
            loc = tellobj.tell()
            if data is None:
                break  # End of database
            if not data:
                continue  # Skip this line
            # Add to the index
            title, idxline = data[0:2]  #self._make_locator(data)
            title = title.encode('utf-8')
            if self.indexname:
                indexobj[title].append(idxline)
            elif copy_to:
                # Without a separate index, bookmark titles in the copy.
                copy_to.bookmark(title)
        fileobj.close()
    if copy_to:
        copy_to.close()
    if self.indexname:
        # Write out a separate index, if required (e.g. names databases)
        indexfh = ChunkedFile(self.dbfile, self.indexname, mode='a',
                              autoflush=False)
        for title, linenos in sorted(indexobj.items()):
            indexfh.write(title)
            indexfh.write("\t")
            indexfh.write(' '.join(str(i) for i in linenos))
            indexfh.write("\n")
            indexfh.bookmark(title)
        indexfh.close()
    else:
        # An index is required to use more than one file, since the
        # resulting combination will not be sorted
        assert (len(filenames) == 1)