class WordExtractor(object):
    """Collect candidate words into a persistent word store.

    Words come either from an iterable (``process_words`` / calling the
    instance) or from ``/``-separated text files (``process_file`` /
    ``process_files``).  A word is stored only if ``self.filters.keep``
    accepts it, it is not already in ``self.new_words`` and, when a
    frequency callback was supplied, its frequency is above the threshold.

    Attributes:
        get_word_freq: optional callable ``word -> frequency``.
        new_words: mapping-like store returned by ``wordb.open``; used via
            ``in`` and item assignment.
        filters: ``Filters`` instance whose ``keep(word)`` decides whether a
            word is a candidate at all.
        n_added: number of words written to ``new_words``.
        n_killed: number of words rejected by the filter or already stored.
    """

    def __init__(self, output_file, get_word_freq=None):
        """Open the word store at *output_file* and reset the counters.

        Args:
            output_file: passed unchanged to ``wordb.open``.
            get_word_freq: optional callable returning the frequency of a
                word; when omitted every accepted word is stored with
                frequency ``1``.
        """
        self.get_word_freq = get_word_freq
        self.new_words = wordb.open(output_file)
        self.filters = Filters()
        self.n_killed = 0
        self.n_added = 0

    def __call__(self, words):
        """Process *words* with the fixed threshold ``2560000``."""
        # NOTE(review): this threshold differs from the process_words
        # default (30000); the reason is not visible here - confirm.
        self.process_words(words, threshold=2560000)

    def process_files(self, files):
        """Process several segmented files in one go.

        Args:
            files: iterable of file names; each is opened as UTF-8 text and
                handed to ``process_file``.
        """
        for fn in files:
            with codecs.open(fn, 'r', 'utf-8') as f:
                self.process_file(f)

    def process_file(self, input_file):
        """Process one segmented file.

        Args:
            input_file: iterable of text lines in which words are separated
                by ``/``.

        All distinct tokens of the file are gathered first and then passed
        to ``process_words`` with its default threshold.
        """
        words = set()
        for line in input_file:
            # Drop the line terminator so it does not stick to the last
            # token, then merge the tokens into the set (a set cannot hold
            # another set, so ``update`` is required rather than ``add``).
            words.update(line.rstrip(u'\r\n').split(u'/'))
        self.process_words(words)

    def process_words(self, words, threshold=30000):
        """Store every acceptable word from *words*.

        Args:
            words: iterable of candidate words.
            threshold: when a frequency callback is configured, a word is
                stored only if its frequency is strictly greater than this
                value.

        ``n_killed`` counts words rejected by the filter or already stored;
        ``n_added`` counts words actually written to ``new_words``.  Words
        whose frequency does not exceed the threshold are counted in
        neither.
        """
        for word in words:
            if not self.filters.keep(word) or word in self.new_words:
                self.n_killed += 1
                continue

            if self.get_word_freq:
                freq = self.get_word_freq(word)
                if not freq > threshold:
                    continue
            else:
                # No frequency source: record the word with a nominal count.
                freq = 1

            logging.info("%s\tadded into db", word)
            self.new_words[word] = freq
            self.n_added += 1