コード例 #1
0
ファイル: gen_yahoo.py プロジェクト: zhangyafeikimi/ml-pack
        print >> sys.stderr, '%s [stop word file] [output name] ' \
                             '[doc file] ...' % sys.argv[0]
        sys.exit(1)

    # Gather input paths: every command-line argument from the third one on
    # is handed to collect_files together with the shared list (presumably
    # collect_files appends the files it finds -- confirm against its
    # definition).
    file_list = []
    doc_args = sys.argv[3:]
    for _dir in doc_args:
        collect_files(file_list, _dir)

    # The first argument is loaded into a dedicated stop-word Vocabulary.
    stop_word = Vocabulary()
    stop_word.load(sys.argv[1])

    # Run stem_file over each collected file, sharing a single Vocabulary
    # instance across all of them. Articles stay in file_list order; no
    # shuffling is applied.
    vocab = Vocabulary()
    articles = [stem_file(filename, vocab, stop_word)
                for filename in file_list]

    # Sort the vocabulary, then save it next to the other outputs as
    # "<output name>-vocab".
    vocab.sort()
    vocab_path = sys.argv[2] + '-vocab'
    vocab.save(vocab_path)

    # Write the training file "<output name>-train": one line per article,
    # holding the space-separated vocabulary ids of its words. An article
    # with no words produces an empty line.
    #
    # The with-block guarantees the file is closed even if an id lookup or
    # a write raises part-way through.
    with open(sys.argv[2] + '-train', 'w') as fp:
        for article in articles:
            # Build the line with a single join instead of repeated string
            # concatenation; '%d' keeps the integer formatting of each id,
            # and joining leaves no trailing space to strip.
            line = ' '.join('%d' % vocab.get_id_from_token(word)
                            for word in article)
            fp.write(line)
            fp.write('\n')