def main(option):
    """Build a word vocabulary and save it together with its word counts.

    Reads ``rootpath``, ``collection``, ``threshold``, ``text_style`` and
    ``overwrite`` from *option*. Two files are written under
    ``<rootpath>/<collection>/TextData/vocabulary/<text_style>/``:

    * ``word_vocab_<threshold>.pkl`` -- the pickled vocabulary object
      returned by ``build_vocab``;
    * ``word_vocab_counter_<threshold>.txt`` -- one ``"<word> <count>"``
      entry per line, restricted to words whose count is at least
      ``threshold`` and ordered by descending count.

    The process exits with status 0 when ``checkToSkip`` reports that the
    vocabulary file should be skipped.
    """
    root_dir = option.rootpath
    collection_name = option.collection
    min_count = option.threshold
    style = option.text_style

    vocab_dir = os.path.join(root_dir, collection_name, 'TextData', 'vocabulary', style)
    vocab_file = os.path.join(vocab_dir, 'word_vocab_%d.pkl' % min_count)
    counter_file = os.path.join(os.path.dirname(vocab_file),
                                'word_vocab_counter_%s.txt' % min_count)

    if checkToSkip(vocab_file, option.overwrite):
        sys.exit(0)
    makedirsforfile(vocab_file)

    vocab, raw_counts = build_vocab(collection_name, style,
                                    threshold=min_count, rootpath=root_dir)

    with open(vocab_file, 'wb') as pkl_out:
        pickle.dump(vocab, pkl_out, pickle.HIGHEST_PROTOCOL)
    logger.info("Saved vocabulary file to %s", vocab_file)

    # Keep only words that reach the threshold, most frequent first.
    # sorted() is stable, so ties keep the counter's iteration order.
    frequent = sorted(
        [(token, freq) for token, freq in raw_counts.items() if freq >= min_count],
        key=lambda pair: pair[1],
        reverse=True,
    )
    with open(counter_file, 'w') as txt_out:
        txt_out.write('\n'.join(token + ' %d' % freq for token, freq in frequent))
    logger.info("Saved vocabulary counter file to %s", counter_file)
def __init__(self, datafile, ndims=0, L1_norm=0, L2_norm=0):
    """Record the configuration for this instance and log its start-up.

    Args:
        datafile: Stored unchanged on ``self.datafile``.
        ndims: Stored on ``self.ndims`` (and on the legacy, misspelled
            ``self.nidms``). Presumably the output dimensionality --
            TODO confirm against the code that reads it.
        L1_norm: Stored on ``self.L1_norm``.
        L2_norm: Stored on ``self.L2_norm``.

    Raises:
        AssertionError: If ``L1_norm + L2_norm`` exceeds 1. Presumably the
            two are 0/1 flags of which at most one may be set -- verify
            against callers.
    """
    logger.info(INFO + '.' + self.__class__.__name__ + ' initializing ...')
    self.datafile = datafile
    self.ndims = ndims
    # The original code stored the value only under this misspelled name;
    # keep it as an alias so existing readers of `nidms` keep working.
    self.nidms = ndims
    self.L1_norm = L1_norm
    self.L2_norm = L2_norm
    assert (L1_norm + L2_norm) <= 1, \
        'L1_norm + L2_norm must not exceed 1'