def exe_build_train(argv):
    """Build window-ranking training data and write it to a text file.

    argv: [judge_path, topic_path, word_stat_path, doc_path, window_path,
    out_path].  Loads the relevance judgements, topics, document/window
    databases and word statistics into module globals (build_train reads
    them), ranks the windows of every judged topic, and writes one line
    per scored window:

        <topic_id> <docno> <judge> <sentence_id> <score1,score2,...>
    """
    # 1. create the workers (module globals: build_train depends on these)
    judge_path, topic_path, word_stat_path, doc_path, window_path, out_path = argv
    global judge_file, topics, doc_db, window_db, word_stat, ranker
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)
    doc_db = bsddb.hashopen(doc_path)
    window_db = bsddb.hashopen(window_path)
    word_stat = load_word_stat(word_stat_path)
    ranker = RetrievalWindowRanker(CosTextScorer(), DocumentModelFactory(word_stat))

    # 2. build the training data.  A list comprehension (rather than map)
    # keeps the len() check valid on both Python 2 and Python 3, where
    # map() is lazy.
    topic_ids = judge_file.keys()
    docs_groups = [build_train(topic_id) for topic_id in topic_ids]
    assert len(docs_groups) == len(topic_ids)

    # 3. write out the training data; "with" guarantees the file is
    # closed even if a judgement lookup below raises.
    with open(out_path, 'w') as writer:
        for topic_id, docs in zip(topic_ids, docs_groups):
            for doc in docs:
                docno = doc.docno
                judge = judge_file[topic_id][docno]
                for scores, sentence_id in doc.score_windows:
                    score_str = ','.join(map(str, scores))
                    writer.write('%s %s %s %d %s\n'
                                 % (topic_id, docno, judge, sentence_id, score_str))
def exe_extract_feature(argv):
    """Extract per-document feature lines for every judged topic.

    argv: [window_path, doc_path, topic_path, judge_path, word_stat_path,
    out_path].  Builds the text-processing chains as module globals (the
    pool workers inherit them via fork), then for each topic that has
    relevance judgements scores its judged documents in parallel and
    writes the resulting feature lines, one per line, to out_path.
    """
    window_path, doc_path, topic_path, judge_path, word_stat_path, out_path = argv
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)

    # Module globals read by multithread_extract_feature in the workers.
    global window_db, doc_db, word_stat, model_factory
    window_db = bsddb.hashopen(window_path)
    doc_db = bsddb.hashopen(doc_path)
    word_stat = load_word_stat(word_stat_path)
    model_factory = DocumentModelFactory(word_stat)

    global topic_chain, window_chain, doc_chain
    topic_chain = TextChain([TextTokenizer(word_tokenize),
                             TextStopRemover(stop_path),
                             TextStemmer(stemmer),
                             TextModeler(model_factory)])
    window_chain = topic_chain
    doc_chain = TextChain([TextTokenizer(word_tokenize),
                           TextStopRemover(stop_path),
                           TextStemmer(stemmer),
                           TextModeler(model_factory),
                           WindowWorker(window_chain),
                           DocumentTitleWorker(topic_chain)])

    global topic_id, topic
    with open(out_path, 'w') as writer:
        for topic_id in judge_file.keys():
            if topic_id not in topics:
                continue
            topic_str = topics[topic_id]
            print(topic_id)
            topic = TextPiece(topic_str)
            topic_chain.work(topic)
            # The pool must be created *after* the per-topic globals are
            # set, because the forked workers capture them at creation
            # time.  close()/join() fixes the original code's leak of one
            # pool (and its worker processes) per topic.
            p = Pool(task_num)
            try:
                lines_group = p.map(multithread_extract_feature,
                                    judge_file[topic_id].items())
            finally:
                p.close()
                p.join()
            for lines in lines_group:
                for line in lines:
                    writer.write(line)
                    writer.write('\n')
def exe_stat_window(qrel_path, window_db_path):
    """Print sentence-count and sentence-length statistics for judged windows.

    For every (topic, docno) pair in the qrel file whose docno has an
    entry in the window database, counts the newline-separated sentences
    and their whitespace-token lengths, then prints four numbers:
    mean(#sentences) median(#sentences) mean(sentence_len) median(sentence_len).
    """
    window_db = bsddb.hashopen(window_db_path)
    qrel = QRelFile(qrel_path)
    sentence_nums = []
    sentence_lens = []
    for q in qrel.keys():
        for d in qrel.get(q).keys():
            if d not in window_db:
                continue
            sentences = window_db[d].split('\n')
            sentence_nums.append(len(sentences))
            # extend with a generator instead of += map(...): avoids
            # building a throwaway intermediate list.
            sentence_lens.extend(len(sentence.split()) for sentence in sentences)
    # %s-formatted single string prints identically under Python 2 and 3.
    print('%s %s %s %s' % (np.mean(sentence_nums), np.median(sentence_nums),
                           np.mean(sentence_lens), np.median(sentence_lens)))
# Script: print every unique lemma appearing in the topics that have
# relevance judgements.
# Usage: python <script> <topic_path> <qrel_path>
import sys

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *

topics = StandardFormat().read(sys.argv[1])
judge_file = QRelFile(sys.argv[2])

lemmas = set()
topic_ids = judge_file.keys()
for topic_id in topic_ids:
    # Skip judged topics with no topic text available.
    if topic_id not in topics:
        continue
    topic_str = topics[topic_id]
    topic = TextPiece(topic_str)
    lemmas.update(topic.lemmas)

for lemma in lemmas:
    print(lemma)