# Imports assumed by the examples below; TextChain, TextPiece, TRECReader,
# match_window, stop_path, stemmer, task_num, etc. come from the project's
# own modules (mirroring the imports of the script at the bottom of this file).
import sys
import bsddb
from multiprocessing import Pool

from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *


def exe_extract_windows(argv):
    # argv: topic file, qrel judgement file, input text DB, output window DB.
    topic_path, judge_path, text_db_path, windows_db_path = argv
    text_db = bsddb.hashopen(text_db_path)
    window_db = bsddb.hashopen(windows_db_path, 'w')
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)
    # Topics are tokenized, stop-filtered, and stemmed; candidate sentences
    # are stemmed but keep their stopwords.
    topic_chain = TextChain([TextTokenizer(word_tokenize), TextStopRemover('data/stoplist.dft'), TextStemmer(EnglishStemmer()), TextTokenNormalizer()])
    sentence_chain = TextChain([TextTokenizer(word_tokenize), TextStemmer(EnglishStemmer()), TextTokenNormalizer()])
    for topic_id, topic_str in topics.items():
        print topic_id
        sys.stdout.flush()
        topic = TextPiece(topic_str)
        topic_chain.work(topic)
        if not judge_file.has_key(topic_id):
            continue
        # Keep only judged documents from the ClueWeb category-B subset.
        docnos = judge_file[topic_id].keys()
        for docno in docnos:
            if not is_cluewebB(docno):
                continue
            doc_text = text_db[docno]
            window_candidates = match_window(topic, doc_text, sentence_chain)
            sentences = [piece.text for piece in window_candidates]
            window_db[docno] = '\n'.join(sentences).encode('utf8')
    window_db.close()
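
# Invocation sketch for exe_extract_windows (all paths are hypothetical
# placeholders; the two .db files are bsddb hash files produced by earlier
# pipeline steps):
#
#   exe_extract_windows(['data/topics.txt', 'data/qrels.txt',
#                        'data/text.db', 'data/windows.db'])
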
def exe_extract_words(article_path, topic_path, out_word_path):
    term_set = set()

    topics = StandardFormat().read(topic_path)
    for topic_id, topic_string in topics.items():
        topic = complete_text_work(topic_string)
        for token in topic.tokens:
            term_set.add(token)

    reader = TRECReader()
    reader.open(article_path)
    doc = reader.next()
    while doc:
        print doc.ID, len(term_set)
        text = complete_text_work(doc.text)
        for token in text.tokens:
            term_set.add(token)
        doc = reader.next()

    print 'writing.....'
    word_list_file = open(out_word_path, 'w')
    # Write the vocabulary one term per line, sorted for stable output.
    for word in sorted(term_set):
        word_list_file.write('%s\n' % word)
    word_list_file.close()
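
# Invocation sketch (hypothetical paths), reading a TREC-format article file
# and a topic file, then dumping their combined sorted vocabulary:
#
#   exe_extract_words('data/articles.trec', 'data/topics.txt',
#                     'data/words.txt')
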
def exe_extract_feature(argv):
    window_path, doc_path, topic_path, judge_path, word_stat_path, out_path = argv
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)
    # Globals, so the forked Pool workers below can reach them without
    # explicit argument passing.
    global window_db, doc_db, word_stat, model_factory
    window_db = bsddb.hashopen(window_path)
    doc_db = bsddb.hashopen(doc_path)
    word_stat = load_word_stat(word_stat_path)
    model_factory = DocumentModelFactory(word_stat)
    writer = open(out_path, 'w')

    global topic_chain, window_chain, doc_chain
    topic_chain = TextChain([TextTokenizer(word_tokenize), TextStopRemover(stop_path), TextStemmer(stemmer), TextModeler(model_factory)])
    window_chain = topic_chain
    doc_chain = TextChain([TextTokenizer(word_tokenize), TextStopRemover(stop_path), TextStemmer(stemmer), TextModeler(model_factory), WindowWorker(window_chain), DocumentTitleWorker(topic_chain)])

    global topic_id
    topic_ids = judge_file.keys()
    for topic_id in topic_ids:
        if not topics.has_key(topic_id):
            continue
        topic_str = topics[topic_id]
        print topic_id
        global topic
        topic = TextPiece(topic_str)
        topic_chain.work(topic)

        # Extract features for this topic's judged documents in parallel,
        # one worker call per (docno, judgement) pair; release the workers
        # before moving on to the next topic.
        p = Pool(task_num)
        lines_group = p.map(multithread_extract_feature, judge_file[topic_id].items())
        p.close()
        p.join()
        for lines in lines_group:
            for line in lines:
                writer.write(line)
                writer.write('\n')
    writer.close()
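
# Invocation sketch (hypothetical paths); the six arguments are the window
# DB, document DB, topic file, qrel file, word-statistics file, and the
# feature file to write:
#
#   exe_extract_feature(['data/windows.db', 'data/docs.db',
#                        'data/topics.txt', 'data/qrels.txt',
#                        'data/word_stat', 'data/features.txt'])
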
def exe_extract_topic_words(argv):
    topic_path, word_list_path = argv
    trec_format = StandardFormat()
    word_list_file = open(word_list_path, 'w')
    topics = trec_format.read(topic_path)
    word_set = set()
    for topic_id, topic_text in topics.items():
        words = [word.lower() for word in word_tokenize(topic_text)]
        word_set.update(words)
    # Sort before writing; a set has no stable iteration order.
    word_list_file.write('\n'.join(sorted(word_set)))
    word_list_file.close()
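
# Invocation sketch (hypothetical paths):
#
#   exe_extract_topic_words(['data/topics.txt', 'data/topic_words.txt'])
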
import sys

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *


# Collect the lemmas of every judged topic and print them, one per line.
topics = StandardFormat().read(sys.argv[1])
judge_file = QRelFile(sys.argv[2])
lemmas = set()
topic_ids = judge_file.keys()
for topic_id in topic_ids:
    if not topics.has_key(topic_id):
        continue
    topic_str = topics[topic_id]
    topic = TextPiece(topic_str)
    lemmas.update(topic.lemmas)

for lemma in lemmas:
    print lemma
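
# Usage sketch (hypothetical file names; the script prints one lemma per
# line to stdout):
#
#   python extract_topic_lemmas.py topics.txt qrels.txt > topic_lemmas.txt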