# Tail of the command-line driver (shown here flattened onto one line).
#
# The first two statements write a usage string
# ("<argv[0]> [stop word file] [output name] [doc file] ...") to stderr and
# exit with status 1.  The condition that guards them is above the visible
# region.
#
# The remaining statements, in order:
#   1. call collect_files(file_list, _dir) for every argument in
#      sys.argv[3:], passing the same file_list each time;
#   2. create a Vocabulary and load it from sys.argv[1] (the stop-word file);
#   3. call stem_file(filename, vocab, stop_word) for each entry of file_list
#      and keep each returned article in `articles`;
#   4. sort the vocabulary and save it to sys.argv[2] + '-vocab';
#   5. write sys.argv[2] + '-train': one line per article, holding the
#      space-separated '%d' ids returned by vocab.get_id_from_token(word),
#      with trailing whitespace stripped, followed by '\n'.
#
# NOTE(review): ids are looked up only after vocab.sort(); presumably sort()
# changes the id assignment, which would be why the lookup is deferred --
# confirm.
# NOTE(review): the output file is closed by a plain fp.close() with no
# with/try around the loop, so it stays open if a lookup or write raises.
# NOTE(review): each line is built with repeated `sb +=`; ' '.join(...) would
# give the same text without repeated string concatenation.
# NOTE(review): in this single-line form the `#` before random.shuffle also
# comments out every statement after it; the original layout presumably had
# line breaks there -- confirm against the unflattened file.
print >> sys.stderr, '%s [stop word file] [output name] ' \ '[doc file] ...' % sys.argv[0] sys.exit(1) file_list = [] for _dir in sys.argv[3:]: collect_files(file_list, _dir) stop_word = Vocabulary() stop_word.load(sys.argv[1]) vocab = Vocabulary() articles = [] for filename in file_list: article = stem_file(filename, vocab, stop_word) articles.append(article) # random.shuffle(articles) vocab.sort() vocab.save(sys.argv[2] + '-vocab') fp = open(sys.argv[2] + '-train', 'w') for article in articles: sb = '' for word in article: sb += '%d ' % vocab.get_id_from_token(word) sb = sb.rstrip() fp.write(sb) fp.write('\n') fp.close()