def test_extract_text(judge_path, index_path):
    """Print the extracted text of a small sample of judged documents.

    Loads the qrel file at ``judge_path``, prints how many doc numbers it
    holds, keeps the doc numbers accepted by ``is_cluewebB`` and prints the
    text returned by ``extract_text`` for the first three of them, each
    followed by a 20-dash separator line.

    Args:
        judge_path: Path handed to ``QRelFile``.
        index_path: Index location forwarded to ``extract_text``.

    NOTE(review): a later definition of ``test_extract_text`` in this file
    takes an extra ``collection_type`` argument and rebinds the name --
    confirm which version is meant to be live.
    """
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    # Single-argument print(...) emits identical text on Python 2 and 3.
    print('doc number: %d' % len(docnos))
    # A list comprehension is always sliceable, unlike filter() on Python 3.
    sample = [docno for docno in docnos if is_cluewebB(docno)][:3]
    for docno in sample:
        print(extract_text(docno, index_path))
        print('-' * 20)
def test_extract_text(judge_path, index_path, collection_type):
    """Print the extracted text of the first judged document.

    Loads the qrel file at ``judge_path``, prints how many doc numbers it
    holds, then prints the text returned by ``extract_text`` for the first
    doc number only, followed by a 20-dash separator line.

    Args:
        judge_path: Path handed to ``QRelFile``.
        index_path: Index location forwarded to ``extract_text``.
        collection_type: Forwarded unchanged to ``extract_text``.
    """
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    # Single-argument print(...) emits identical text on Python 2 and 3.
    print('doc number: %d' % len(docnos))
    # Only the first doc number is inspected; this is a quick smoke check.
    for docno in docnos[:1]:
        print(extract_text(docno, index_path, collection_type))
        print('-' * 20)
def exe_extract_text(judge_path, index_path, out_path, collection_type='html'):
    """Extract the texts of the docs in a qrel file and store them at out_path.

    Every doc number returned by the qrel file's ``key2s()`` is passed to
    ``extract_text`` and the result is written through
    ``Corpus.TRECWriter`` as a ``Corpus.Document`` (standard TREC format,
    per the original author's description).

    Args:
        judge_path: Path handed to ``QRelFile``.
        index_path: Index location forwarded to ``extract_text``.
        out_path: Destination handed to ``Corpus.TRECWriter``.
        collection_type: Forwarded unchanged to ``extract_text``;
            defaults to ``'html'``.

    NOTE(review): a later definition of ``exe_extract_text`` in this file
    (writing to a bsddb hash file) rebinds the name -- confirm which
    version is meant to be live.
    """
    # Kept as a function-scope import, as in the original.
    import Corpus

    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    # Single-argument print(...) emits identical text on Python 2 and 3.
    print('doc number: %d' % len(docnos))
    # NOTE(review): the writer is never explicitly closed or flushed here;
    # confirm whether Corpus.TRECWriter needs that to persist its output.
    writer = Corpus.TRECWriter(out_path)
    for docno in docnos:
        text = extract_text(docno, index_path, collection_type)
        writer.write(Corpus.Document(docno, text))
def exe_extract_text(judge_path, index_path, text_db_path):
    """Extract the texts of judged docs and store them in a bsddb hash file.

    Keeps the qrel doc numbers accepted by ``is_cluewebB``, runs
    ``extract_text`` over them through ``fastmap.fastmap`` and stores each
    result in the database under its doc number.

    Args:
        judge_path: Path handed to ``QRelFile``.
        index_path: Index location forwarded to ``extract_text``.
        text_db_path: Path of the bsddb hash file to write into.

    Raises:
        AssertionError: If ``fastmap.fastmap`` returns a different number
            of results than doc numbers it was given.
    """
    judge_file = QRelFile(judge_path)
    # A list comprehension always yields a list, so len() below also works
    # where filter() is lazy (Python 3).
    docnos = [docno for docno in judge_file.key2s() if is_cluewebB(docno)]
    # Single-argument print(...) emits identical text on Python 2 and 3.
    print('doc number: %d' % len(docnos))

    # NOTE(review): per the bsddb docs, flag 'w' opens an existing database
    # read-write and does not create one; use 'c' if creation is intended.
    db = bsddb.hashopen(text_db_path, 'w')
    try:
        # 30 is passed through to fastmap as in the original; presumably the
        # degree of parallelism -- confirm against fastmap's signature.
        texts = fastmap.fastmap(
            lambda docno: extract_text(docno, index_path), 30, docnos)
        assert len(docnos) == len(texts)
        for docno, text in zip(docnos, texts):
            db[docno] = text
    finally:
        # Always release the database handle, even when extraction fails.
        db.close()