def exe_extract_windows(argv):
    """Store, per judged document, the text windows matched for its topic.

    For every topic read from the topic file, the topic id is printed, the
    topic text is run through a tokenize / stop-word / stem / normalize
    chain, and -- if the judgment file has an entry for that topic -- each
    judged document accepted by ``is_cluewebB`` is looked up in the text
    database.  The pieces returned by ``match_window`` are joined with
    newlines and written, UTF-8 encoded, to the windows database under the
    document number.

    Args:
        argv: sequence of exactly four paths, in this order:
            ``topic_path, judge_path, text_db_path, windows_db_path``.

    Raises:
        ValueError: if ``argv`` does not contain exactly four items.
        KeyError: if a judged document is missing from the text database.
    """
    topic_path, judge_path, text_db_path, windows_db_path = argv

    text_db = bsddb.hashopen(text_db_path)
    try:
        # NOTE(review): flag 'w' opens an existing database read-write; it
        # does not create a missing file ('c' would) -- confirm intent.
        window_db = bsddb.hashopen(windows_db_path, 'w')
        try:
            judge_file = QRelFile(judge_path)
            topics = StandardFormat().read(topic_path)

            # Topics additionally get stop words removed; sentences do not.
            topic_chain = TextChain([
                TextTokenizer(word_tokenize),
                TextStopRemover('data/stoplist.dft'),
                TextStemmer(EnglishStemmer()),
                TextTokenNormalizer(),
            ])
            sentence_chain = TextChain([
                TextTokenizer(word_tokenize),
                TextStemmer(EnglishStemmer()),
                TextTokenNormalizer(),
            ])

            for topic_id, topic_str in topics.items():
                # Progress output, flushed so it shows up immediately.
                print(topic_id)
                sys.stdout.flush()

                topic = TextPiece(topic_str)
                topic_chain.work(topic)

                # has_key is kept on purpose: QRelFile is defined elsewhere
                # and may not support the `in` operator.
                if not judge_file.has_key(topic_id):
                    continue

                _store_topic_windows(
                    topic,
                    judge_file[topic_id].keys(),
                    text_db,
                    window_db,
                    sentence_chain,
                )
        finally:
            # Close the output database even when processing fails.
            window_db.close()
    finally:
        # The original never closed the input database.
        text_db.close()


def _store_topic_windows(topic, docnos, text_db, window_db, sentence_chain):
    """Write the windows matched for *topic* in each accepted document."""
    for docno in docnos:
        if not is_cluewebB(docno):
            continue

        window_candidates = match_window(topic, text_db[docno], sentence_chain)
        text = '\n'.join([piece.text for piece in window_candidates])
        window_db[docno] = text.encode('utf8')
def exe_extract_words(article_path, topic_path, out_word_path):
    """Write the sorted set of tokens found in the topics and articles.

    Tokens are gathered from every topic string in the topic file and from
    the text of every document produced by a ``TRECReader`` opened on
    ``article_path``.  Each text is processed with ``complete_text_work``
    and the ``tokens`` of its result are collected.  Reading stops as soon
    as ``reader.next()`` returns a falsy value.  The distinct tokens are
    then sorted and written one per line to ``out_word_path``.

    Args:
        article_path: path handed to ``TRECReader.open``.
        topic_path: path handed to ``StandardFormat().read``.
        out_word_path: path of the word list file to (over)write.
    """
    term_set = set()

    topics = StandardFormat().read(topic_path)
    for _topic_id, topic_string in topics.items():
        term_set.update(complete_text_work(topic_string).tokens)

    reader = TRECReader()
    reader.open(article_path)
    doc = reader.next()
    while doc:
        # Progress output: document id and vocabulary size so far.
        print('%s %s' % (doc.ID, len(term_set)))
        term_set.update(complete_text_work(doc.text).tokens)
        doc = reader.next()

    print('writing.....')
    # `with` closes the file even if a write fails; the explicit loop
    # replaces a map() call that was used only for its side effects.
    with open(out_word_path, 'w') as word_list_file:
        for word in sorted(term_set):
            word_list_file.write('%s\n' % word)