import sys

import numpy

# NOTE: Vocabulary (a word <-> integer-id mapping providing word_to_id()
# and id_to_word()) is assumed to be defined elsewhere in this package.


class NGramIndex:
    """
    This class maintains an index of information about n-grams using a
    table of sorted hashes. It has two modes of operation, depending on
    whether 'index_insert_words' is True or False:

    * If 'index_insert_words' is True, then for an n-gram, n-1 words are
      used as the 'context', i.e. a table is kept with rows:

          <context-ngram-hash> <context-ngram-count> <insert-word-id>

      where <insert-word-id> is the vocabulary id of the word that
      completes the n-gram, taken from position insert_word_posn+1 in
      the n-gram.

    * If 'index_insert_words' is False, then for an n-gram, all n words
      are used, i.e. a table is kept with rows:

          <ngram-hash> <ngram-count>

    The tables are sorted by the hash, and n-gram information is
    retrieved by doing a binary search on the n-gram hashes to find the
    appropriate row(s) with the other n-gram info. This approach is MUCH
    more memory efficient than an approach that uses a Python dictionary
    (by a factor of about 10).
    """

    PRINT_MOD = 100000

    def __init__(self, ngram_file, insert_word_posn, index_insert_words,
                 min_count):
        print "\nNGramIndex: Initializing a new index"
        self.index_insert_words = index_insert_words
        self.counts_total = 0
        self.num_uniq_ngrams = 0
        self.vocab = Vocabulary()
        print "Now examining ngram counts in", ngram_file
        max_lines = self.ngram_file_mincount_line(ngram_file, min_count)
        print "For a min ngram count of", min_count,
        print "need to read", max_lines, "ngrams from", ngram_file
        self.ngram_hash = numpy.zeros(max_lines, dtype=numpy.int64)
        self.ngram_count = numpy.zeros(max_lines, dtype=numpy.int32)
        if index_insert_words:
            self.ngram_gapword = numpy.zeros(max_lines, dtype=numpy.int32)
        self.build_index(ngram_file, insert_word_posn, index_insert_words,
                         max_lines, min_count)

    def ngram_file_mincount_line(self, fname, min_count):
        # Each ngram file contains lines with the ngram count and the
        # ngram tokens, one ngram per line, with lines sorted in
        # descending order by count. Return the number of leading lines
        # whose count is >= min_count.
        num_lines = 0
        fin = open(fname)
        for line in fin:
            count = int(line.split()[0])
            if count < min_count:
                break
            num_lines += 1
        fin.close()
        return num_lines

    def hash_words(self, word_list):
        # Python's built-in hash() of the space-joined words; this fits
        # in an int64 on 64-bit builds.
        return hash(' '.join(word_list))
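    # Illustration (not from the original source; the words and counts
    # are made up): with index_insert_words=True and insert_word_posn=1,
    # the 4-gram "the quick brown fox" seen 7 times is indexed under the
    # context "the quick fox" -- the word after position 1 ("brown") is
    # removed -- giving a table row of roughly:
    #
    #     hash("the quick fox")  7  vocab.word_to_id("brown")
    #
    # With index_insert_words=False the same 4-gram is stored whole:
    #
    #     hash("the quick brown fox")  7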
    def build_index(self, ngram_file, insert_word_posn, index_insert_words,
                    max_lines, min_count):
        ix = 0
        ngram_counts = self.read_ngram_counts(ngram_file,
                                              max_lines=max_lines,
                                              min_count=min_count)
        for ngram, count in ngram_counts:
            if index_insert_words:
                assert 0 <= insert_word_posn < len(ngram) - 1
                # NOTE Slightly tricky indexing issue: by convention,
                # insert_word_posn usually points at the word before the
                # assumed gap/deleted word. It can't point at the deleted
                # word directly, because the deleted word is assumed gone.
                # But when indexing ngrams that *do* include the "insert
                # word" that will be deleted, one *can* point to it; it
                # is in the (insert_word_posn+1) position.
                insert_word = ngram[insert_word_posn + 1]
                context_words = (ngram[:insert_word_posn + 1] +
                                 ngram[insert_word_posn + 2:])
                self.ngram_hash[ix] = self.hash_words(context_words)
                self.ngram_count[ix] = count
                self.ngram_gapword[ix] = self.vocab.word_to_id(insert_word)
                if False:  # DEBUG prints
                    print ix, context_words, self.ngram_hash[ix], '->',
                    print insert_word, self.ngram_gapword[ix],
                    print 'count:', self.ngram_count[ix]
            else:
                self.ngram_hash[ix] = self.hash_words(ngram)
                self.ngram_count[ix] = count
                if False:  # DEBUG prints
                    print ix, ngram, self.ngram_hash[ix],
                    print 'count:', self.ngram_count[ix]
            ix += 1
            self.counts_total += count
            self.num_uniq_ngrams += 1
        print "Sorting the index by ngram hash"
        sort_order = self.ngram_hash.argsort()
        self.ngram_hash = self.ngram_hash[sort_order]
        self.ngram_count = self.ngram_count[sort_order]
        if index_insert_words:
            self.ngram_gapword = self.ngram_gapword[sort_order]

    def read_ngram_counts(self, ngram_file, max_lines=None, min_count=None):
        print "Reading n-grams from:", ngram_file
        fin = open(ngram_file, 'r')
        for line_num, line in enumerate(fin):
            tokens = line.rstrip().split()
            count = int(tokens[0])
            ngram = tokens[1:]
            self.print_status(line_num)
            # Stop *before* yielding line max_lines, so that exactly
            # max_lines n-grams (indices 0..max_lines-1) are produced
            # and the preallocated arrays are filled completely.
            if max_lines is not None and line_num >= max_lines:
                print "\nStopped: reached file max_lines limit of:", max_lines
                print "Read", line_num, "lines."
                break
            if min_count is not None and count < min_count:
                print "\nStopped: reached n-gram min_count limit of", min_count
                print "Read", line_num, "lines."
                break
            yield (ngram, count)
        fin.close()
        print self.counts_total, "total n-grams,",
        print self.num_uniq_ngrams, "unique",
        print "(%6.2f%%)" % (100. * self.num_uniq_ngrams / self.counts_total)

    def print_status(self, line_num):
        # During a lengthy file read, print a '.' after every PRINT_MOD
        # lines read.
        if line_num % NGramIndex.PRINT_MOD == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

    # @profile  # for line_profiler
    def insert_word_suggestions(self, context_words):
        assert self.index_insert_words == True
        suggestions = []
        h = self.hash_words(context_words)
        # searchsorted() does a binary search; with the default
        # side='left' it returns the first row whose hash is >= h, so we
        # can walk forward while the hashes match.
        ix = self.ngram_hash.searchsorted(h)
        while ix < len(self.ngram_hash) and self.ngram_hash[ix] == h:
            word_id = self.ngram_gapword[ix]
            word = self.vocab.id_to_word(word_id)
            count = self.ngram_count[ix]
            suggestions.append((word, count))
            ix += 1
        return suggestions

    def get_count(self, ngram_words):
        assert self.index_insert_words == False
        h = self.hash_words(ngram_words)
        ix = self.ngram_hash.searchsorted(h)
        found = ix < len(self.ngram_hash) and self.ngram_hash[ix] == h
        if not found:
            return 0
        else:
            return self.ngram_count[ix]
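
# A minimal usage sketch, not part of the original module: the counts
# file name 'trigram.counts' and the parameter values are hypothetical,
# and Vocabulary is assumed to provide the word_to_id()/id_to_word()
# mapping used above. For trigrams with insert_word_posn=0, the context
# ['new', 'city'] retrieves the words seen between 'new' and 'city'.
if __name__ == '__main__':
    suggester = NGramIndex('trigram.counts', insert_word_posn=0,
                           index_insert_words=True, min_count=5)
    for word, count in suggester.insert_word_suggestions(['new', 'city']):
        print word, count  # e.g. 'york' with its trigram count

    # With index_insert_words=False the index answers exact count queries.
    counter = NGramIndex('trigram.counts', insert_word_posn=0,
                         index_insert_words=False, min_count=5)
    print counter.get_count(['new', 'york', 'city'])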