def is_visited(question_id):
    """Report whether a question with this id is already stored.

    Counts the ``Question`` rows whose ``question_id`` matches.

    :param question_id: id to look up; ``None`` skips the query
    :return: ``True`` if at least one matching row exists. Also ``True`` when
        ``question_id`` is ``None`` or the query fails, because the result
        defaults to "visited".
    """
    if question_id is None:
        return True
    try:
        matches = Session.query(Question).filter_by(
            question_id=question_id).count()
    except Exception:
        # Best effort: a failed lookup is logged and reported as "visited".
        # Only Exception is caught so that KeyboardInterrupt and SystemExit
        # still propagate.
        logger.error('fail to query question_id %s', question_id,
                     exc_info=True)
        return True
    return matches != 0
def get_analyzed_result(question_text):
    """Return the analysis of ``question_text``, reusing a stored result.

    The text is keyed by ``md5(question_text)``. If an ``LtpResult`` row with
    that key exists, its ``json_text`` is wrapped in an ``AnalyzedSentence``.
    Otherwise ``analyze`` is called, its result is stored through
    ``save_analyzed_result`` and then wrapped.

    :param question_text: text to analyze; ``None`` is passed through
    :return: ``AnalyzedSentence``, or ``None`` when ``question_text`` is
        ``None``
    :raise RuntimeError: when ``analyze`` raises it; the original exception
        is logged and re-raised unchanged
    """
    if question_text is None:
        return None

    md5_string = md5(question_text)
    stored = Session.query(LtpResult).filter_by(md5=md5_string).first()
    if stored is not None:
        return AnalyzedSentence(md5_string, stored.json_text)

    try:
        result_json = analyze(question_text)
    except RuntimeError:
        logger.error('fail to invoke ltp api, text=%s', question_text,
                     exc_info=True)
        # Re-raise the caught exception so its message and traceback
        # reach the caller instead of an empty RuntimeError.
        raise

    save_analyzed_result(md5_string, result_json)
    return AnalyzedSentence(md5_string, result_json)
def k_fold_cross_dataset(k, num):
    """Generate k-fold cross test set and train set.

    Builds the file names for ``k`` folds, numbered from 0 to ``k - 1``. If
    any of those files is missing, ``num`` ``FilteredParagraph`` rows are
    loaded, shuffled, dealt round-robin into ``k`` folds, and every fold's
    test and train files are (re)written through ``generate_dataset``.

    Example:
        In: k_fold_cross_dataset(2, 10)
        Out: [
            {
                'test_text': 'data/2-fold-cross-10-test-text-0.txt',
                'test_label': 'data/2-fold-cross-10-test-label-0.txt',
                'train_text': 'data/2-fold-cross-10-train-text-0.txt',
                'train_label': 'data/2-fold-cross-10-train-label-0.txt',
            },
            {
                'test_text': 'data/2-fold-cross-10-test-text-1.txt',
                'test_label': 'data/2-fold-cross-10-test-label-1.txt',
                'train_text': 'data/2-fold-cross-10-train-text-1.txt',
                'train_label': 'data/2-fold-cross-10-train-label-1.txt',
            }
        ]

    :param k: int, number of folds
    :param num: int, number of paragraphs to split across the folds
    :return: list of ``k`` dicts with the keys ``test_text``, ``test_label``,
        ``train_text`` and ``train_label``
    :raise RuntimeError: if the query does not return exactly ``num``
        paragraphs
    """
    # (result key, file-name fragment) for each of the four files of a fold.
    kinds = (
        ('test_text', 'test-text'),
        ('test_label', 'test-label'),
        ('train_text', 'train-text'),
        ('train_label', 'train-label'),
    )
    file_names = [
        {
            key: 'data/{k}-fold-cross-{num}-{kind}-{i}.txt'.format(
                k=k, num=num, kind=kind, i=i)
            for key, kind in kinds
        }
        for i in range(k)
    ]

    # Reuse a previous run only when every file of every fold is present.
    all_present = all(
        os.path.isfile(path)
        for names in file_names
        for path in names.values())
    if all_present:
        return file_names

    filtered_paragraphs = Session.query(FilteredParagraph).limit(num).all()
    if len(filtered_paragraphs) != num:
        raise RuntimeError(
            'expected {0} filtered paragraphs, got {1}'.format(
                num, len(filtered_paragraphs)))
    random.shuffle(filtered_paragraphs)

    # Round-robin split: paragraph j goes to fold j % k.
    folds = [
        [item.paragraph for item in filtered_paragraphs[start::k]]
        for start in range(k)
    ]

    for i, names in enumerate(file_names):
        test_set = folds[i]
        # The train set is every fold except the one held out for testing.
        train_set = [
            paragraph
            for j, fold in enumerate(folds) if j != i
            for paragraph in fold
        ]
        # generate test set
        generate_dataset(test_set, names['test_text'], names['test_label'])
        # generate train set
        generate_dataset(train_set, names['train_text'], names['train_label'])

    return file_names
#!/usr/bin/env python
# coding: utf-8
# Export every Paragraph row to data/baidu-zhidao-paragraph.txt as UTF-8:
# the question title, then each reply's content, each followed by a newline,
# with an extra newline closing the paragraph.
__author__ = 'wilfredwang'

from data_access import Session
from data_access import Paragraph
import traceback

with open('data/baidu-zhidao-paragraph.txt', 'wb') as f:
    count = 0  # paragraphs written so far; reported if the export fails
    try:
        for paragraph in Session.query(Paragraph):
            lines = [paragraph.question.title, '\n']
            for reply in paragraph.reply:
                lines.append(reply.content)
                lines.append('\n')
            lines.append('\n')
            # The file is opened in binary mode, so encode before writing.
            f.writelines([s.encode('utf-8') for s in lines])
            count += 1
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(count)
    except Exception:
        # Report how far the export got; KeyboardInterrupt and SystemExit
        # are left to propagate.
        print('error, count %d' % count)
        traceback.print_exc()