Example #1
0
class WordExtractor(object):
    """Collect words that are not yet in a word database.

    Candidate words are screened by a ``Filters`` instance and by the
    contents of the store opened from ``output_file``.  When a frequency
    callback is supplied, a word is stored only if its frequency exceeds a
    threshold; otherwise every surviving word is stored with a count of 1.

    Attributes:
        get_word_freq: optional callable ``word -> frequency``.
        new_words: store returned by ``wordb.open(output_file)``; used here
            only through ``in`` and item assignment (presumably dict-like,
            confirm against ``wordb``).
        filters: ``Filters`` instance whose ``keep(word)`` decides whether a
            word is a candidate at all.
        n_added: number of words actually written to ``new_words``.
        n_killed: number of words rejected (filtered out, already present,
            or not frequent enough).
    """

    def __init__(self, output_file, get_word_freq=None):
        """Open the output store and reset the counters.

        Args:
            output_file: path handed to ``wordb.open``.
            get_word_freq: optional callable returning the frequency of a
                word; when falsy, no frequency check is performed.
        """
        self.get_word_freq = get_word_freq
        self.new_words = wordb.open(output_file)
        self.filters = Filters()
        self.n_killed = 0
        self.n_added = 0

    def __call__(self, words):
        """Process ``words`` using the stricter call-time threshold.

        Calling the extractor directly uses a much higher frequency
        threshold (2560000) than the ``process_words`` default (30000).
        NOTE(review): the reason for this specific value is not visible
        here; confirm with the caller before changing it.
        """
        self.process_words(words, threshold=2560000)

    def process_files(self, files):
        """Process a batch of segmented files.

        Args:
            files: iterable of paths to UTF-8 encoded segmented text files.
        """
        for fn in files:
            with codecs.open(fn, 'r', 'utf-8') as f:
                self.process_file(f)

    def process_file(self, input_file):
        """Process one segmented file.

        Each line is expected to contain words separated by ``/``.

        Args:
            input_file: iterable of text lines (e.g. an open file object).
        """
        words = set()
        for line in input_file:
            # Drop the line terminator so the last token on a line does not
            # carry a trailing newline, and skip empty tokens produced by
            # blank lines or doubled separators.
            tokens = line.rstrip(u'\r\n').split(u'/')
            words.update(token for token in tokens if token)
        self.process_words(words)

    def process_words(self, words, threshold=30000):
        """Store every acceptable word from ``words`` and update counters.

        A word is a candidate when ``self.filters.keep(word)`` is true and
        it is not already in ``self.new_words``.  With a frequency callback
        the candidate is stored (with its frequency) only when the frequency
        is strictly greater than ``threshold``; without one it is stored
        with a count of 1.

        Args:
            words: iterable of candidate words.
            threshold: minimum frequency (exclusive) required when
                ``get_word_freq`` is set.
        """
        for word in words:
            if not self.filters.keep(word) or word in self.new_words:
                self.n_killed += 1
                continue

            if self.get_word_freq:
                freq = self.get_word_freq(word)
                if not freq > threshold:
                    # Not frequent enough: the word is not stored, so it
                    # must not be counted as added.
                    self.n_killed += 1
                    continue
            else:
                freq = 1

            logging.info("%s\tadded into db", word)
            self.new_words[word] = freq
            self.n_added += 1