def exe_extract_feature(argv):
    window_path, doc_path, topic_path, judge_path, word_stat_path, out_path = argv;
    judge_file = QRelFile(judge_path);
    topics = StandardFormat().read(topic_path);
    global window_db, doc_db, word_stat, model_factory;
    window_db = bsddb.hashopen(window_path);
    doc_db = bsddb.hashopen(doc_path);
    word_stat = load_word_stat(word_stat_path);
    model_factory = DocumentModelFactory(word_stat);
    writer = open(out_path, 'w');

    global topic_chain, window_chain, doc_chain;
    topic_chain = TextChain([TextTokenizer(word_tokenize), TextStopRemover(stop_path), TextStemmer(stemmer), TextModeler(model_factory)]);
    window_chain = topic_chain;
    doc_chain = TextChain([TextTokenizer(word_tokenize), TextStopRemover(stop_path), TextStemmer(stemmer), TextModeler(model_factory), WindowWorker(window_chain), DocumentTitleWorker(topic_chain)]);

    global topic_id;
    topic_ids = judge_file.keys();
    for topic_id in topic_ids:
        if not topics.has_key(topic_id):
            continue;
        topic_str = topics[topic_id];
        print topic_id;
        global topic;
        topic = TextPiece(topic_str);
        topic_chain.work(topic);

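        # multithread_extract_feature (defined elsewhere in this module) is the
        # Pool worker: it receives one (docno, judge) item from
        # judge_file[topic_id] and returns a list of feature lines.  The
        # globals assigned above are what the forked worker processes read,
        # since only the (docno, judge) item itself is passed through the Pool.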
        p = Pool(task_num);
        lines_group = p.map(multithread_extract_feature, judge_file[topic_id].items());
        p.close();
        p.join();
        for lines in lines_group:
            for line in lines:
                writer.write(line);
                writer.write('\n');
    writer.close();
def exe_extract_windows(argv):
    topic_path, judge_path, text_db_path, windows_db_path = argv;
    text_db = bsddb.hashopen(text_db_path);
    window_db = bsddb.hashopen(windows_db_path, 'w');
    judge_file = QRelFile(judge_path);
    topics = StandardFormat().read(topic_path);
    topic_chain = TextChain([TextTokenizer(word_tokenize), TextStopRemover('data/stoplist.dft'), TextStemmer(EnglishStemmer()), TextTokenNormalizer()]); 
    sentence_chain = TextChain([TextTokenizer(word_tokenize), TextStemmer(EnglishStemmer()), TextTokenNormalizer()]);
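    # topic_chain removes stopwords before stemming, while sentence_chain keeps
    # them, presumably so that match_window sees the full sentence text.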
    for topic_id, topic_str in topics.items():
        print topic_id;
        sys.stdout.flush();
        topic = TextPiece(topic_str);
        topic_chain.work(topic);
        if not judge_file.has_key(topic_id):
            continue;
        docnos = judge_file[topic_id].keys();
        for docno in docnos:
            if not is_cluewebB(docno):
                continue;
            doc_text = text_db[docno];
            window_candidates = match_window(topic, doc_text, sentence_chain);
            sentences = map(lambda text_piece: text_piece.text, window_candidates);
            text = '\n'.join(sentences);
            window_db[docno] = text.encode('utf8');
    window_db.close();
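# is_cluewebB is defined elsewhere in this module.  The helper below is only a
# hypothetical sketch of the ClueWeb09 Category B check, assuming docnos of the
# form 'clueweb09-en0003-55-31884' and that Category B covers the segments
# en0000-en0011 plus the Wikipedia segments enwp00-enwp03.
def is_cluewebB_sketch(docno):
    segment = docno.split('-')[1];
    if segment.startswith('enwp'):
        return True;
    return segment.startswith('en') and int(segment[2:]) <= 11;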
def exe_build_train(argv):
#1. create the workers;
    judge_path, topic_path, word_stat_path, doc_path, window_path, out_path = argv;
    global judge_file, topics, doc_db, window_db, word_stat, ranker;
    judge_file = QRelFile(judge_path);
    topics = StandardFormat().read(topic_path);
    doc_db = bsddb.hashopen(doc_path);
    window_db = bsddb.hashopen(window_path);
    word_stat = load_word_stat(word_stat_path);
#    aggregators = map(lambda k: Aggregator(k), K_options);
#    ranker = DistanceWindowRanker(CosTextScorer(), DocumentModelFactory(word_stat),aggregators);
    ranker = RetrievalWindowRanker(CosTextScorer(), DocumentModelFactory(word_stat));

#2. build the training data;
#    p = Pool(4);
    topic_ids = judge_file.keys();
#    docs_groups = p.map(build_train, topic_ids);
    docs_groups = map(build_train, topic_ids);
    assert len(docs_groups) == len(topic_ids);

#3. write out the training data
    writer = open(out_path, 'w');
    for i in xrange(len(topic_ids)):
        topic_id = topic_ids[i];
        docs = docs_groups[i];
        for doc in docs:
            docno = doc.docno;
            judge = judge_file[topic_id][docno];
            for scores, sentence_id in doc.score_windows:
                score_str = ','.join(map(str, scores));
                writer.write('%s %s %s %d %s\n' % (topic_id, docno, judge, sentence_id, score_str));    
    writer.close();
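# Each line written above has the form
#   <topic_id> <docno> <judge> <sentence_id> <score_1,score_2,...>
# e.g. (values purely illustrative):
#   101 clueweb09-en0003-55-31884 1 4 0.31,0.27,0.12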
def test_extract_text(judge_path, index_path):
    judge_file = QRelFile(judge_path);
    docnos = judge_file.key2s();
    print 'doc number:', len(docnos);
    for docno in filter(is_cluewebB, docnos)[:3]:
        text = extract_text(docno, index_path);
        print text
        print '-' * 20
def test_extract_text(judge_path, index_path, collection_type):
    judge_file = QRelFile(judge_path);
    docnos = judge_file.key2s();
    print 'doc number:', len(docnos);
    for docno in docnos[:1]:
        text = extract_text(docno, index_path, collection_type);
        print text
        print '-' * 20
def exe_extract_text(judge_path, index_path, out_path, collection_type = 'html'):
    '''
        Extract the text of each document listed in the qrel file from an index, and store it in out_path in standard TREC format.
    '''
    import Corpus
    judge_file = QRelFile(judge_path);
    docnos = judge_file.key2s();
    print 'doc number:', len(docnos);
    writer = Corpus.TRECWriter(out_path);
    for docno in docnos:
        text = extract_text(docno, index_path, collection_type)
        writer.write(Corpus.Document(docno, text))
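# Assuming Corpus.TRECWriter emits the usual TREC document layout, each record
# written above would look roughly like:
#   <DOC>
#   <DOCNO> clueweb09-en0003-55-31884 </DOCNO>
#   ...document text...
#   </DOC>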
def exe_extract_text(judge_path, index_path, text_db_path):
    judge_file = QRelFile(judge_path);
    docnos = judge_file.key2s();
    docnos = filter(is_cluewebB, docnos);
    #docnos = docnos[:1000];
    print 'doc number:', len(docnos);
    db = bsddb.hashopen(text_db_path, 'w');
    count = 0;
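    # fastmap.fastmap appears to apply extract_text to every docno using 30
    # parallel workers and to return the results in the original order.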
    texts = fastmap.fastmap(lambda docno: extract_text(docno, index_path), 30, docnos);
    assert len(docnos) == len(texts);
    for i in xrange(len(docnos)): 
        db[docnos[i]] = texts[i];
    db.close();
def exe_stat_window(qrel_path, window_db_path):
    window_db = bsddb.hashopen(window_db_path);
    qrel = QRelFile(qrel_path);
    sentence_nums = [];
    sentence_lens = [];
    for q in qrel.keys():
        for d in qrel.get(q).keys():
            if window_db.has_key(d):
                window = window_db[d];
                sentences = window.split('\n');
                sentence_nums.append(len(sentences));
                sentence_lens += map(lambda sentence: len(sentence.split()), sentences);
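    # report the mean/median number of window sentences per judged document and
    # the mean/median sentence length in tokens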
    print np.mean(sentence_nums), np.median(sentence_nums), np.mean(sentence_lens), np.median(sentence_lens);
def exe_example(snippet_judge_path, doc_judge_path):
#def exe_example(snippet_judge_path, doc_judge_path, bing_path, sum_path, dsm_path):
    from JudgeFile import QRelFile;
    snippet_judge = load_snippet_judge(snippet_judge_path);
    doc_judge = QRelFile(doc_judge_path);
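    # print the cases where the document is relevant, the windowshop.oq snippet
    # judgment agrees (also 1), but the bing or pablo.short snippet judgment
    # disagrees with the binarized document judgment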
    sources = snippet_judge.keys();
    for topic_id in snippet_judge[sources[0]].keys():
        for docno in snippet_judge[sources[0]][topic_id]:
            in_rel = int(doc_judge.get_value(topic_id, docno));
            if in_rel <= 0:
                in_rel = 0;
            elif in_rel > 0:
                in_rel = 1;
            if snippet_judge['bing'].has_key(topic_id) and snippet_judge['pablo.short'].has_key(topic_id) and snippet_judge['windowshop.oq'].has_key(topic_id):
                bing_per_rel = int(snippet_judge['bing'][topic_id][docno]);
                sum_per_rel = int(snippet_judge['pablo.short'][topic_id][docno]);
                oq_per_rel = int(snippet_judge['windowshop.oq'][topic_id][docno]);
                if (in_rel != bing_per_rel or in_rel != sum_per_rel) and in_rel == oq_per_rel == 1:
                    print topic_id, docno, in_rel, bing_per_rel, sum_per_rel, oq_per_rel;
import sys

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *;


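# Standalone script: print every unique lemma appearing in the judged topics.
# Usage (argument order taken from the sys.argv reads below):
#   python <this script> <topic_path> <qrel_path>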
topics = StandardFormat().read(sys.argv[1]);
judge_file = QRelFile(sys.argv[2])
lemmas = set()
topic_ids = judge_file.keys()
for topic_id in topic_ids:
    if not topics.has_key(topic_id):
        continue
    topic_str = topics[topic_id]
    topic = TextPiece(topic_str)

    lemmas.update(topic.lemmas)

for lemma in lemmas:
    print lemma