def exe_extract_windows(argv):
    topic_path, judge_path, text_db_path, windows_db_path = argv
    text_db = bsddb.hashopen(text_db_path)
    window_db = bsddb.hashopen(windows_db_path, 'w')
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)

    # The topic chain removes stopwords; the sentence chain does not.
    topic_chain = TextChain([TextTokenizer(word_tokenize),
                             TextStopRemover('data/stoplist.dft'),
                             TextStemmer(EnglishStemmer()),
                             TextTokenNormalizer()])
    sentence_chain = TextChain([TextTokenizer(word_tokenize),
                                TextStemmer(EnglishStemmer()),
                                TextTokenNormalizer()])

    for topic_id, topic_str in topics.items():
        print topic_id
        sys.stdout.flush()
        topic = TextPiece(topic_str)
        topic_chain.work(topic)
        if not judge_file.has_key(topic_id):
            continue
        # Only judged ClueWeb-B documents are considered.
        for docno in judge_file[topic_id].keys():
            if not is_cluewebB(docno):
                continue
            doc_text = text_db[docno]
            window_candidates = match_window(topic, doc_text, sentence_chain)
            sentences = [piece.text for piece in window_candidates]
            window_db[docno] = '\n'.join(sentences).encode('utf8')
    window_db.close()
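# Usage sketch for exe_extract_windows (the file names below are made up; the
# exe_ prefix suggests these functions are dispatched from the command line,
# with argv holding everything after the command name):
#
#   exe_extract_windows(['topics.txt', 'qrels.txt', 'text.db', 'windows.db'])
#
# This reads the judged ClueWeb-B documents out of text.db and stores the
# matched sentence windows for each docno in windows.db.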
def exe_extract_words(article_path, topic_path, out_word_path):
    # Build the vocabulary: every token from the topics plus every token
    # from the TREC article collection, written out as a sorted word list.
    term_set = set()
    topics = StandardFormat().read(topic_path)
    for topic_id, topic_string in topics.items():
        topic = complete_text_work(topic_string)
        for token in topic.tokens:
            term_set.add(token)

    reader = TRECReader()
    reader.open(article_path)
    doc = reader.next()
    while doc:
        print doc.ID, len(term_set)
        text = complete_text_work(doc.text)
        for token in text.tokens:
            term_set.add(token)
        doc = reader.next()

    print 'writing.....'
    word_list_file = open(out_word_path, 'w')
    for word in sorted(term_set):
        word_list_file.write('%s\n' % word)
    word_list_file.close()
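# Usage sketch (hypothetical paths):
#
#   exe_extract_words('articles.trec', 'topics.txt', 'word_list.txt')
#
# word_list.txt ends up with one sorted token per line, covering the topics
# and every document in the TREC article file.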
def exe_extract_feature(argv):
    window_path, doc_path, topic_path, judge_path, word_stat_path, out_path = argv
    judge_file = QRelFile(judge_path)
    topics = StandardFormat().read(topic_path)

    # These globals are read by multithread_extract_feature in the Pool
    # workers, which inherit them when the pool forks.
    global window_db, doc_db, word_stat, model_factory
    window_db = bsddb.hashopen(window_path)
    doc_db = bsddb.hashopen(doc_path)
    word_stat = load_word_stat(word_stat_path)
    model_factory = DocumentModelFactory(word_stat)
    writer = open(out_path, 'w')

    global topic_chain, window_chain, doc_chain
    topic_chain = TextChain([TextTokenizer(word_tokenize),
                             TextStopRemover(stop_path),
                             TextStemmer(stemmer),
                             TextModeler(model_factory)])
    window_chain = topic_chain
    doc_chain = TextChain([TextTokenizer(word_tokenize),
                           TextStopRemover(stop_path),
                           TextStemmer(stemmer),
                           TextModeler(model_factory),
                           WindowWorker(window_chain),
                           DocumentTitleWorker(topic_chain)])

    global topic_id
    for topic_id in judge_file.keys():
        if not topics.has_key(topic_id):
            continue
        topic_str = topics[topic_id]
        print topic_id
        global topic
        topic = TextPiece(topic_str)
        topic_chain.work(topic)

        # The pool is created after the per-topic globals are set, so the
        # forked workers see them; close it so processes are not leaked.
        p = Pool(task_num)
        lines_group = p.map(multithread_extract_feature, judge_file[topic_id].items())
        p.close()
        p.join()
        for lines in lines_group:
            for line in lines:
                writer.write(line)
                writer.write('\n')
    writer.close()
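# exe_extract_feature shares its large read-only objects (window_db, doc_db,
# model_factory, the current topic) with the Pool workers through module
# globals: on Unix, multiprocessing.Pool forks, so each child inherits a
# snapshot of the parent's globals at fork time. A minimal sketch of that
# pattern (illustrative names only, not part of this module):
#
#   from multiprocessing import Pool
#
#   state = None
#
#   def work(item):
#       return '%s -> %s' % (state, item)   # reads the inherited global
#
#   def main():
#       global state
#       state = 'loaded model'              # set BEFORE Pool() is created
#       pool = Pool(4)
#       print pool.map(work, [1, 2, 3])
#       pool.close(); pool.join()
#
# Anything assigned to a global after the fork would not be visible to the
# workers, which is why the Pool above is created inside the per-topic loop.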
def exe_extract_topic_words(argv):
    from nltk.tokenize import word_tokenize
    topic_path, word_list_path = argv
    trec_format = StandardFormat()
    word_list_file = open(word_list_path, 'w')
    topics = trec_format.read(topic_path)

    # Lower-cased tokens from every topic, de-duplicated.
    word_set = set()
    for topic_id, topic_text in topics.items():
        word_set.update(word.lower() for word in word_tokenize(topic_text))
    word_list_file.write('\n'.join(word_set))
    word_list_file.close()
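# Usage sketch (hypothetical paths):
#
#   exe_extract_topic_words(['topics.txt', 'topic_words.txt'])
#
# Note that the output order is arbitrary: word_set is an unordered set and
# is written without sorting.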
import sys

from JudgeFile import QRelFile
from TRECTopics import StandardFormat
from TrainGenerator import *

# Print every lemma that occurs in a topic with relevance judgments.
topics = StandardFormat().read(sys.argv[1])
judge_file = QRelFile(sys.argv[2])

lemmas = set()
for topic_id in judge_file.keys():
    if not topics.has_key(topic_id):
        continue
    topic = TextPiece(topics[topic_id])
    lemmas.update(topic.lemmas)

for lemma in lemmas:
    print lemma
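# Usage sketch: if this script is saved as extract_topic_lemmas.py (a
# hypothetical name), it would be run as
#
#   python extract_topic_lemmas.py topics.txt qrels.txt
#
# printing one lemma per line for every judged topic.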