Code Example #1
File: gen_yahoo.py  Project: zhangyafeikimi/ml-pack
import sys

# collect_files, stem_file and Vocabulary come from the same project; the
# excerpt starts at the command-line check, so the guard below is restored.
if __name__ == '__main__':
    if len(sys.argv) <= 3:
        print >> sys.stderr, '%s [stop word file] [output name] ' \
                             '[doc file] ...' % sys.argv[0]
        sys.exit(1)

    file_list = []
    for _dir in sys.argv[3:]:
        collect_files(file_list, _dir)

    stop_word = Vocabulary()
    stop_word.load(sys.argv[1])
    vocab = Vocabulary()
    articles = []

    for filename in file_list:
        article = stem_file(filename, vocab, stop_word)
        articles.append(article)
    # random.shuffle(articles)

    vocab.sort()
    vocab.save(sys.argv[2] + '-vocab')

    fp = open(sys.argv[2] + '-train', 'w')
    for article in articles:
        # one document per line: space-separated token ids, word order kept
        fp.write(' '.join('%d' % vocab.get_id_from_token(word)
                          for word in article))
        fp.write('\n')
    fp.close()
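
The Vocabulary class used above (and in the next example) is defined elsewhere in the projects and is not shown in the excerpts. A minimal sketch of the interface the snippets call (load, save, sort, get_id_from_token) follows; the method names come from the snippets themselves, but the bodies and the has_token() membership check are assumptions.

# Hypothetical sketch of the Vocabulary interface used by both snippets.
class Vocabulary(object):
    def __init__(self):
        self.token_to_id = {}
        self.tokens = []

    def get_id_from_token(self, token):
        # assign the next free id the first time a token is seen
        if token not in self.token_to_id:
            self.token_to_id[token] = len(self.tokens)
            self.tokens.append(token)
        return self.token_to_id[token]

    def has_token(self, token):
        return token in self.token_to_id

    def load(self, filename):
        # one token per line; ids follow file order
        fp = open(filename)
        for line in fp:
            self.get_id_from_token(line.strip())
        fp.close()

    def save(self, filename):
        fp = open(filename, 'w')
        for token in self.tokens:
            fp.write(token + '\n')
        fp.close()

    def sort(self):
        # re-assign ids in lexicographic token order
        self.tokens.sort()
        self.token_to_id = dict((t, i) for i, t in enumerate(self.tokens))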
Code Example #2
File: lda_doc_proc.py  Project: kurff/ml-lda
import sys

def stem_file(filename, vocab, stop_word):
    # Reconstructed head (the excerpt begins mid-function): read the
    # document, stem each token, skip stop words, register the token in
    # the vocabulary, and count occurrences. stem() and has_token() are
    # assumed helpers; only the last four lines are from the original.
    word_count = {}
    infile = open(filename, 'r')
    for line in infile:
        for word in line.split():
            stemmed_word = stem(word)  # hypothetical stemming helper
            if not stop_word.has_token(stemmed_word):  # hypothetical check
                vocab.get_id_from_token(stemmed_word)  # register in vocab
                count = word_count.get(stemmed_word, 0) + 1
                word_count[stemmed_word] = count
    infile.close()
    return word_count

if __name__ == '__main__':
    if len(sys.argv) <= 2:
        print >>sys.stderr, '%s [stop word file] [doc file] ...' % sys.argv[0]
        sys.exit(1)

    stop_word = Vocabulary()
    stop_word.load(sys.argv[1])
    vocab = Vocabulary()
    word_count_list = []

    for filename in sys.argv[2:]:
        word_count = stem_file(filename, vocab, stop_word)
        word_count_list.append(word_count)
    vocab.sort()
    vocab.save('train.vocab')

    fp = open('train', 'w')
    for word_count in word_count_list:
        for word in word_count.keys():
            word_id = vocab.get_id_from_token(word)  # avoid shadowing builtin id()
            count = word_count[word]
            fp.write('%d:%d ' % (word_id, count))
        fp.write('\n')
    fp.close()
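
The two scripts emit different training formats: gen_yahoo.py writes each document as a space-separated sequence of token ids with word order preserved, while lda_doc_proc.py writes one bag-of-words line of id:count pairs per document, the sparse format many LDA trainers expect. A minimal sketch for reading the sparse file back into per-document count dictionaries, assuming the 'train' filename used above:

# Parse the sparse output: one document per line, tokens encoded as 'id:count'.
def load_sparse_docs(filename='train'):
    docs = []
    fp = open(filename)
    for line in fp:
        doc = {}
        for pair in line.split():
            word_id, count = pair.split(':')
            doc[int(word_id)] = int(count)
        docs.append(doc)
    fp.close()
    return docs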