Example #1
# Python 2 code: bsddb and the print statement are not available in Python 3.
# bsddb and a process Pool are needed by this snippet; word_tokenize, stop_path,
# stemmer, task_num and multithread_extract_feature are expected to come from
# the surrounding TrainGenerator module.
import bsddb
from multiprocessing import Pool  # assumed source of Pool; the original module may import it differently


def exe_extract_feature(argv):
    window_path, doc_path, topic_path, judge_path, word_stat_path, out_path = argv
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)

    # Kept as module globals so the pool workers run by
    # multithread_extract_feature can read them.
    global window_db, doc_db, word_stat, model_factory
    window_db = bsddb.hashopen(window_path)
    doc_db = bsddb.hashopen(doc_path)
    word_stat = load_word_stat(word_stat_path)
    model_factory = DocumentModelFactory(word_stat)
    writer = open(out_path, 'w')

    # Text-processing pipelines: topics and windows share one chain, while
    # documents get an extended chain that also handles windows and titles.
    global topic_chain, window_chain, doc_chain
    topic_chain = TextChain([TextTokenizer(word_tokenize),
                             TextStopRemover(stop_path),
                             TextStemmer(stemmer),
                             TextModeler(model_factory)])
    window_chain = topic_chain
    doc_chain = TextChain([TextTokenizer(word_tokenize),
                           TextStopRemover(stop_path),
                           TextStemmer(stemmer),
                           TextModeler(model_factory),
                           WindowWorker(window_chain),
                           DocumentTitleWorker(topic_chain)])

    global topic_id
    topic_ids = judge_file.keys()
    for topic_id in topic_ids:
        if not topics.has_key(topic_id):
            continue
        topic_str = topics[topic_id]
        print topic_id
        global topic
        topic = TextPiece(topic_str)
        topic_chain.work(topic)

        # A fresh pool per topic so the workers see the current globals
        # (topic, topic_chain, ...); the topic's judged documents are
        # processed in parallel.
        p = Pool(task_num)
        lines_group = p.map(multithread_extract_feature, judge_file[topic_id].items())
        p.close()
        p.join()
        for lines in lines_group:
            for line in lines:
                writer.write(line)
                writer.write('\n')
    writer.close()
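
This function leans on names defined elsewhere in TrainGenerator (load_word_stat, TextChain and its workers, multithread_extract_feature, stop_path, stemmer, task_num). A minimal invocation sketch, assuming the six paths arrive as positional command-line arguments; the entry point below is not part of the original code:

# Hypothetical driver for exe_extract_feature; expects exactly six arguments
# in the same order as the unpacking at the top of the function.
if __name__ == '__main__':
    import sys
    exe_extract_feature(sys.argv[1:])  # window.db doc.db topics qrels word_stat out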
Example #2

import sys

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *  # provides TextPiece and the other helpers used below


topics = StandardFormat().read(sys.argv[1])
judge_file = QRelFile(sys.argv[2])

# Collect the lemmas of every topic that has relevance judgements.
lemmas = set()
topic_ids = judge_file.keys()
for topic_id in topic_ids:
    if not topics.has_key(topic_id):
        continue
    topic_str = topics[topic_id]
    topic = TextPiece(topic_str)
    lemmas.update(topic.lemmas)

# One lemma per line on stdout.
for lemma in lemmas:
    print lemma
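
Assuming this second script is saved as, say, list_topic_lemmas.py (a hypothetical name), it takes the topic file and the qrel file as its two positional arguments and writes one lemma per line to stdout:

    python list_topic_lemmas.py topics.txt qrels.txt > lemmas.txt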