def exe_extract_feature(argv):
    """Extract feature lines for every judged topic and write them to a file.

    ``argv`` must unpack into exactly six paths, in this order: window
    database, document database, topic file, judgment (qrel) file, word
    statistics file, and output file.

    For each topic id present in both the judgment file and the topic file,
    the topic text is run through ``topic_chain``, the judged items of that
    topic are mapped over ``multithread_extract_feature`` in a process pool,
    and every returned line is written to the output file followed by a
    newline.

    Side effects: rebinds the module-level globals ``window_db``, ``doc_db``,
    ``word_stat``, ``model_factory``, ``topic_chain``, ``window_chain``,
    ``doc_chain``, ``topic_id`` and ``topic``, and prints each processed
    topic id to stdout.
    NOTE(review): presumably ``multithread_extract_feature`` reads these
    globals inside the worker processes -- confirm against its definition.

    Raises:
        ValueError: if ``argv`` does not unpack into exactly six values.
    """
    # All globals are declared up front; a ``global`` statement that follows
    # other uses of the name in the function is at best a warning.
    global window_db, doc_db, word_stat, model_factory
    global topic_chain, window_chain, doc_chain
    global topic_id, topic

    (window_path, doc_path, topic_path,
     judge_path, word_stat_path, out_path) = argv

    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)

    window_db = bsddb.hashopen(window_path)
    doc_db = bsddb.hashopen(doc_path)
    word_stat = load_word_stat(word_stat_path)
    model_factory = DocumentModelFactory(word_stat)

    # ``with`` guarantees the output file is flushed and closed even when
    # feature extraction fails part-way through.
    with open(out_path, 'w') as writer:
        topic_chain = TextChain([
            TextTokenizer(word_tokenize),
            TextStopRemover(stop_path),
            TextStemmer(stemmer),
            TextModeler(model_factory),
        ])
        # Windows go through the very same chain object as topics.
        window_chain = topic_chain
        doc_chain = TextChain([
            TextTokenizer(word_tokenize),
            TextStopRemover(stop_path),
            TextStemmer(stemmer),
            TextModeler(model_factory),
            WindowWorker(window_chain),
            DocumentTitleWorker(topic_chain),
        ])

        for topic_id in judge_file.keys():
            # NOTE(review): ``has_key`` is kept because the concrete type
            # returned by ``StandardFormat().read`` is not visible here.
            if not topics.has_key(topic_id):
                continue
            print(topic_id)

            topic = TextPiece(topics[topic_id])
            topic_chain.work(topic)

            # NOTE(review): the pool is created only after ``topic`` has been
            # set, presumably so that forked workers see the current topic --
            # confirm before hoisting it out of the loop.
            p = Pool(task_num)
            try:
                lines_group = p.map(multithread_extract_feature,
                                    judge_file[topic_id].items())
            finally:
                # Release the worker processes of this topic; previously one
                # pool per topic was left running until interpreter exit.
                # ``map`` has already returned (or raised), so no pending
                # work is lost by terminating.
                p.terminate()
                p.join()

            for lines in lines_group:
                for line in lines:
                    writer.write(line)
                    writer.write('\n')
import sys

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *

# Usage: <script> <topic_file> <judge_file>
# Prints, one per line, every distinct lemma found in the topics that have
# an entry in the judgment file.

topics = StandardFormat().read(sys.argv[1])
judge_file = QRelFile(sys.argv[2])

# Union of the lemmas of all judged topics.
lemmas = set()
for topic_id in judge_file.keys():
    # Only topics present in the topic file contribute lemmas.
    if topics.has_key(topic_id):
        topic = TextPiece(topics[topic_id])
        lemmas.update(topic.lemmas)

# Emitted in the set's own iteration order (no sorting).
for lemma in lemmas:
    print(lemma)