def main(option):
    """Build a word vocabulary for one collection and save it with its counts.

    Two files are written into
    ``<rootpath>/<collection>/TextData/vocabulary/<text_style>/``:

    * ``word_vocab_<threshold>.pkl`` -- the vocabulary object returned by
      ``build_vocab``, pickled with the highest available protocol.
    * ``word_vocab_counter_<threshold>.txt`` -- one ``"<word> <count>"`` line
      for every word whose count is at least ``threshold``, highest count
      first, with no trailing newline.

    The process exits with status 0 before doing any work when
    ``checkToSkip`` returns a true value for the vocabulary file.

    Args:
        option: Object exposing the ``rootpath``, ``collection``,
            ``threshold``, ``text_style`` and ``overwrite`` attributes.
    """
    rootpath, collection = option.rootpath, option.collection
    threshold, text_style = option.threshold, option.text_style

    vocab_name = 'word_vocab_%d.pkl' % threshold
    vocab_file = os.path.join(rootpath, collection, 'TextData', 'vocabulary',
                              text_style, vocab_name)
    # The counter file lives next to the pickled vocabulary.
    counter_name = 'word_vocab_counter_%s.txt' % threshold
    counter_file = os.path.join(os.path.dirname(vocab_file), counter_name)

    if checkToSkip(vocab_file, option.overwrite):
        sys.exit(0)
    makedirsforfile(vocab_file)

    vocab, counts = build_vocab(collection, text_style,
                                threshold=threshold, rootpath=rootpath)
    with open(vocab_file, 'wb') as pkl_out:
        pickle.dump(vocab, pkl_out, pickle.HIGHEST_PROTOCOL)
    logger.info("Saved vocabulary file to %s", vocab_file)

    # Keep only sufficiently frequent words; the sort is stable, so words with
    # equal counts stay in the order the counter yielded them.
    kept = sorted(
        ((word, cnt) for word, cnt in counts.items() if cnt >= threshold),
        key=lambda pair: pair[1],
        reverse=True,
    )
    with open(counter_file, 'w') as txt_out:
        txt_out.write('\n'.join(word + ' %d' % cnt for word, cnt in kept))
    logger.info("Saved vocabulary counter file to %s", counter_file)
Ejemplo n.º 2
0
    def __init__(self, datafile, ndims=0, L1_norm=0, L2_norm=0):
        """Record the constructor arguments on the instance.

        Logs an "initializing" message built from the externally defined
        ``INFO`` prefix and the concrete class name, then stores each argument
        as an attribute and checks the two normalization arguments.

        Args:
            datafile: Stored as ``self.datafile``; it is not opened or read in
                the lines shown here.
            ndims: Stored as ``self.nidms`` (note the attribute spelling).
                Defaults to 0.
            L1_norm: Stored as ``self.L1_norm``. Defaults to 0. Presumably a
                0/1 flag requesting L1 normalization -- TODO confirm.
            L2_norm: Stored as ``self.L2_norm``. Defaults to 0. Presumably a
                0/1 flag requesting L2 normalization -- TODO confirm.

        Raises:
            AssertionError: If ``L1_norm + L2_norm`` is greater than 1 (only
                when assertions are enabled).
        """
        logger.info(INFO + '.' + self.__class__.__name__ + ' initializing ...')
        self.datafile = datafile
        # NOTE(review): the attribute is spelled ``nidms``, not ``ndims``. This
        # looks like a typo, but code outside this view may read ``self.nidms``;
        # confirm before renaming.
        self.nidms = ndims
        self.L1_norm = L1_norm
        self.L2_norm = L2_norm

        # The two arguments may not sum to more than 1 -- presumably so that at
        # most one normalization is selected (TODO confirm).
        assert (L1_norm + L2_norm) <= 1