import sys
import bsddb
from multiprocessing import Pool

import numpy as np

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *  # assumed to supply build_train, TextPiece, the Text* chain classes, etc.


def exe_build_train(argv):
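    """Build training data for the window ranker.

    argv: judge_path, topic_path, word_stat_path, doc_path, window_path, out_path.
    Ranks the windows of every judged document and writes one line per scored
    window: "topic_id docno judgement sentence_id score1,score2,...".
    """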
    # 1. create the workers
    judge_path, topic_path, word_stat_path, doc_path, window_path, out_path = argv
    global judge_file, topics, doc_db, window_db, word_stat, ranker
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)
    doc_db = bsddb.hashopen(doc_path)
    window_db = bsddb.hashopen(window_path)
    word_stat = load_word_stat(word_stat_path)
    # alternative ranker, kept for reference:
    # aggregators = map(lambda k: Aggregator(k), K_options)
    # ranker = DistanceWindowRanker(CosTextScorer(), DocumentModelFactory(word_stat), aggregators)
    ranker = RetrievalWindowRanker(CosTextScorer(), DocumentModelFactory(word_stat))

    # 2. build the training data
    # parallel version, kept for reference:
    # p = Pool(4)
    # docs_groups = p.map(build_train, topic_ids)
    topic_ids = judge_file.keys()
    docs_groups = map(build_train, topic_ids)
    assert len(docs_groups) == len(topic_ids)

    # 3. write out the training data
    writer = open(out_path, 'w')
    for i in xrange(len(topic_ids)):
        topic_id = topic_ids[i]
        docs = docs_groups[i]
        for doc in docs:
            docno = doc.docno
            judge = judge_file[topic_id][docno]
            for scores, sentence_id in doc.score_windows:
                score_str = ','.join(map(str, scores))
                writer.write('%s %s %s %d %s\n' % (topic_id, docno, judge, sentence_id, score_str))
    writer.close()

def exe_extract_feature(argv):
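    """Extract per-window features for every judged (topic, document) pair.

    argv: window_path, doc_path, topic_path, judge_path, word_stat_path, out_path.
    Models each topic with the topic chain, then processes its judged documents
    in a worker pool and writes the returned feature lines to out_path.
    """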
    window_path, doc_path, topic_path, judge_path, word_stat_path, out_path = argv
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)
    global window_db, doc_db, word_stat, model_factory
    window_db = bsddb.hashopen(window_path)
    doc_db = bsddb.hashopen(doc_path)
    word_stat = load_word_stat(word_stat_path)
    model_factory = DocumentModelFactory(word_stat)
    writer = open(out_path, 'w')

    # text-processing pipelines: topics and windows share one chain; documents
    # additionally get per-window processing and title modeling
    global topic_chain, window_chain, doc_chain
    topic_chain = TextChain([TextTokenizer(word_tokenize), TextStopRemover(stop_path), TextStemmer(stemmer), TextModeler(model_factory)])
    window_chain = topic_chain
    doc_chain = TextChain([TextTokenizer(word_tokenize), TextStopRemover(stop_path), TextStemmer(stemmer), TextModeler(model_factory), WindowWorker(window_chain), DocumentTitleWorker(topic_chain)])

    global topic_id
    topic_ids = judge_file.keys()
    for topic_id in topic_ids:
        if not topics.has_key(topic_id):
            continue
        topic_str = topics[topic_id]
        print topic_id
        global topic
        topic = TextPiece(topic_str)
        topic_chain.work(topic)

        # extract features for this topic's judged documents in parallel;
        # close and join the pool so worker processes are not leaked per topic
        p = Pool(task_num)
        lines_group = p.map(multithread_extract_feature, judge_file[topic_id].items())
        p.close()
        p.join()
        for lines in lines_group:
            for line in lines:
                writer.write(line)
                writer.write('\n')
    writer.close()

def exe_stat_window(qrel_path, window_db_path):
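    """Print summary statistics over the judged documents' window entries:
    mean/median sentence count per entry and mean/median sentence length
    in tokens.
    """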
    window_db = bsddb.hashopen(window_db_path)
    qrel = QRelFile(qrel_path)
    sentence_nums = []
    sentence_lens = []
    for q in qrel.keys():
        for d in qrel.get(q).keys():
            if window_db.has_key(d):
                window = window_db[d]
                sentences = window.split('\n')
                sentence_nums.append(len(sentences))
                sentence_lens += map(lambda sentence: len(sentence.split()), sentences)
    # mean/median sentences per entry, then mean/median tokens per sentence
    print np.mean(sentence_nums), np.median(sentence_nums), np.mean(sentence_lens), np.median(sentence_lens)
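
# A minimal dispatch sketch for the exe_* entry points above (an assumption,
# not part of the original pipeline; argument orders follow the unpacking in
# each function):
#
#     if __name__ == '__main__':
#         cmd, args = sys.argv[1], sys.argv[2:]
#         if cmd == 'build_train':
#             exe_build_train(args)
#         elif cmd == 'extract_feature':
#             exe_extract_feature(args)
#         elif cmd == 'stat_window':
#             exe_stat_window(*args)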

# --- standalone snippet: print the lemma vocabulary of all judged topics ---

topics = StandardFormat().read(sys.argv[1])
judge_file = QRelFile(sys.argv[2])
lemmas = set()
topic_ids = judge_file.keys()
for topic_id in topic_ids:
    if not topics.has_key(topic_id):
        continue
    topic_str = topics[topic_id]
    topic = TextPiece(topic_str)
    lemmas.update(topic.lemmas)

for lemma in lemmas:
    print lemma
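
# Usage sketch (an assumption; the script name is hypothetical, the argument
# order follows the sys.argv indices above):
#     python print_topic_lemmas.py <topic_path> <qrel_path>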