Example #1
def exe_extract_topic_words(argv):
    from nltk.tokenize import word_tokenize
    # StandardFormat is assumed to be provided by the surrounding project
    # (a reader for TREC-formatted topic files).
    topic_path, word_list_path = argv
    trec_format = StandardFormat()
    topics = trec_format.read(topic_path)
    # Collect the unique, lower-cased tokens from every topic.
    word_set = set()
    for topic_id, topic_text in topics.items():
        words = (word.lower() for word in word_tokenize(topic_text))
        word_set.update(words)
    # Write one word per line to the output file.
    with open(word_list_path, 'w') as word_list_file:
        word_list_file.write('\n'.join(word_set))
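
A minimal invocation sketch, assuming StandardFormat is importable from the surrounding project and that argv is a two-element list of paths; the file names below are placeholders, not paths from the original code:

# Hypothetical paths: a TREC-formatted topic file in, a word list out.
exe_extract_topic_words(['topics.trec', 'topic_words.txt'])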