Example #1
0
def exe_extract_windows(argv):
    """Extract topic-matching windows from judged documents into a hash DB.

    ``argv`` must unpack into exactly four paths, in this order:
    ``topic_path, judge_path, text_db_path, windows_db_path``.

    For every topic read from ``topic_path`` that has an entry in the
    judgment file, each judged docno accepted by ``is_cluewebB`` is looked up
    in the text DB and handed to ``match_window`` together with the processed
    topic.  The ``text`` of every returned candidate is joined with newlines,
    UTF-8 encoded and stored in the windows DB under that docno.

    Both databases are closed on exit, including when an error occurs.

    Note: a judged docno that is absent from the text DB is not handled here;
    the lookup error propagates to the caller.

    Raises:
        ValueError: if ``argv`` does not contain exactly four items.
    """
    topic_path, judge_path, text_db_path, windows_db_path = argv

    text_db = bsddb.hashopen(text_db_path)
    try:
        # Flag 'w' is kept from the original; per the bsddb docs it opens an
        # existing database read-write rather than creating one.
        window_db = bsddb.hashopen(windows_db_path, 'w')
        try:
            judge_file = QRelFile(judge_path)
            topics = StandardFormat().read(topic_path)
            # Topics are stop-word filtered; document sentences are not.
            topic_chain = TextChain([
                TextTokenizer(word_tokenize),
                TextStopRemover('data/stoplist.dft'),
                TextStemmer(EnglishStemmer()),
                TextTokenNormalizer(),
            ])
            sentence_chain = TextChain([
                TextTokenizer(word_tokenize),
                TextStemmer(EnglishStemmer()),
                TextTokenNormalizer(),
            ])

            for topic_id, topic_str in topics.items():
                # Progress output; flushed so it shows up immediately.
                print(topic_id)
                sys.stdout.flush()

                # Skip unjudged topics before paying for topic processing.
                if not judge_file.has_key(topic_id):
                    continue

                topic = TextPiece(topic_str)
                topic_chain.work(topic)

                for docno in judge_file[topic_id].keys():
                    if not is_cluewebB(docno):
                        continue
                    doc_text = text_db[docno]
                    window_candidates = match_window(
                        topic, doc_text, sentence_chain)
                    text = '\n'.join(
                        [piece.text for piece in window_candidates])
                    window_db[docno] = text.encode('utf8')
        finally:
            window_db.close()
    finally:
        text_db.close()
Example #2
0
def exe_extract_words(article_path, topic_path, out_word_path):
    """Collect the vocabulary of all topics and documents into a word list.

    Every topic string read from ``topic_path`` and the text of every
    document yielded by the ``TRECReader`` opened on ``article_path`` is run
    through ``complete_text_work``; the resulting ``tokens`` are gathered into
    one set.  The distinct tokens are then written to ``out_word_path`` in
    sorted order, one per line.

    Args:
        article_path: path passed to ``TRECReader.open``.
        topic_path: path passed to ``StandardFormat().read``.
        out_word_path: output file; opened in mode ``'w'``, so any existing
            content is overwritten.
    """
    term_set = set()

    topics = StandardFormat().read(topic_path)
    for _topic_id, topic_string in topics.items():
        term_set.update(complete_text_work(topic_string).tokens)

    reader = TRECReader()
    reader.open(article_path)
    # The loop stops at the first falsy value returned by reader.next().
    doc = reader.next()
    while doc:
        # Progress output: document ID and vocabulary size so far.
        print('%s %s' % (doc.ID, len(term_set)))
        term_set.update(complete_text_work(doc.text).tokens)
        doc = reader.next()

    print('writing.....')
    # `with` closes the file even if a write fails; the explicit loop avoids
    # relying on map() being eager for its side effects.
    with open(out_word_path, 'w') as word_list_file:
        for word in sorted(term_set):
            word_list_file.write('%s\n' % word)