Ejemplo n.º 1
0
def is_visited(question_id):
    """Tell whether a question with the given id is already stored.

    The answer defaults to ``True``: a ``None`` id, or a query that fails,
    is reported as visited.

    :param question_id: id to look up in the ``Question`` table, or None
    :return: bool, True when at least one matching ``Question`` row exists,
        when ``question_id`` is None, or when the query raised
    """
    if question_id is None:
        return True
    try:
        return Session.query(Question).filter_by(
            question_id=question_id).count() != 0
    except Exception:
        # Only ordinary errors are absorbed here; KeyboardInterrupt and
        # SystemExit are allowed to propagate instead of being logged away.
        logger.error('fail to query question_id %s', question_id,
                     exc_info=True)
        return True
Ejemplo n.º 2
0
def get_analyzed_result(question_text):
    """Return the analyzed sentence for a text, reusing a stored result.

    The text is keyed by ``md5(question_text)``. When an ``LtpResult`` row
    with that key exists, its ``json_text`` is wrapped and returned.
    Otherwise ``analyze`` is called, its result is passed to
    ``save_analyzed_result`` and then wrapped and returned.

    :param question_text: text to analyze, or None
    :return: AnalyzedSentence, or None when ``question_text`` is None
    :raise RuntimeError: when ``analyze`` raises RuntimeError; the original
        exception is logged and re-raised unchanged
    """
    if question_text is None:
        return None
    md5_string = md5(question_text)
    ltp_result = Session.query(LtpResult).filter_by(md5=md5_string).first()
    if ltp_result is not None:
        return AnalyzedSentence(md5_string, ltp_result.json_text)

    try:
        result_json = analyze(question_text)
    except RuntimeError:
        logger.error('fail to invoke ltp api, text=%s', question_text,
                     exc_info=True)
        # Bare raise keeps the original message and traceback rather than
        # replacing them with an empty RuntimeError.
        raise

    save_analyzed_result(md5_string, result_json)
    return AnalyzedSentence(md5_string, result_json)
Ejemplo n.º 3
0
def k_fold_cross_dataset(k, num):
    """Return the file names of a k-fold cross split, generating it if needed.

    One dict of four file names is produced per fold, following the pattern
    ``data/{k}-fold-cross-{num}-{kind}-{i}.txt`` where ``i`` is the
    zero-based fold index. If any of those files is missing, every fold is
    regenerated: up to ``num`` ``FilteredParagraph`` rows are queried,
    shuffled and dealt round-robin into ``k`` folds, then each fold's test
    and train paragraphs are handed to ``generate_dataset`` together with
    the matching text/label file names.

    Example:
        In: k_fold_cross_dataset(2, 10)
        Out:
            [
                {
                    'test_text': 'data/2-fold-cross-10-test-text-0.txt',
                    'test_label': 'data/2-fold-cross-10-test-label-0.txt',
                    'train_text': 'data/2-fold-cross-10-train-text-0.txt',
                    'train_label': 'data/2-fold-cross-10-train-label-0.txt',
                },
                {
                    'test_text': 'data/2-fold-cross-10-test-text-1.txt',
                    'test_label': 'data/2-fold-cross-10-test-label-1.txt',
                    'train_text': 'data/2-fold-cross-10-train-text-1.txt',
                    'train_label': 'data/2-fold-cross-10-train-label-1.txt',
                }
            ]

    :param k: int, number of folds
    :param num: int, number of paragraphs to split across the folds
    :return: list of k dicts with the keys 'test_text', 'test_label',
        'train_text' and 'train_label'
    :raise RuntimeError: if the query does not return exactly ``num``
        paragraphs
    """
    prefix = 'data/{k}-fold-cross-{num}'.format(k=k, num=num)
    # (result key, file-name fragment) for each of the four files of a fold.
    kinds = (
        ('test_text', 'test-text'),
        ('test_label', 'test-label'),
        ('train_text', 'train-text'),
        ('train_label', 'train-label'),
    )
    file_names = [
        {key: '{prefix}-{kind}-{i}.txt'.format(prefix=prefix, kind=kind, i=i)
         for key, kind in kinds}
        for i in range(k)
    ]

    # A single missing file triggers regeneration of all folds.
    all_exist = all(os.path.isfile(path)
                    for names in file_names
                    for path in names.values())
    if all_exist:
        return file_names

    filtered_paragraphs = Session.query(FilteredParagraph).limit(num).all()
    if len(filtered_paragraphs) != num:
        raise RuntimeError(
            'expected {0} filtered paragraphs, got {1}'.format(
                num, len(filtered_paragraphs)))
    random.shuffle(filtered_paragraphs)

    # Deal the shuffled paragraphs round-robin into k folds.
    folds = [[] for _ in range(k)]
    for index, filtered in enumerate(filtered_paragraphs):
        folds[index % k].append(filtered.paragraph)

    for i, names in enumerate(file_names):
        test_set = folds[i]
        # The train set is every fold except the one held out for testing.
        train_set = [paragraph
                     for j, fold in enumerate(folds) if j != i
                     for paragraph in fold]
        generate_dataset(test_set, names['test_text'], names['test_label'])
        generate_dataset(train_set, names['train_text'],
                         names['train_label'])
    return file_names
Ejemplo n.º 4
0
#!/usr/bin/env python
# coding: utf-8
__author__ = 'wilfredwang'
from data_access import Session
from data_access import Paragraph
import traceback


# Write every Paragraph to one file: the question title on its own line, one
# line per reply content, then an empty line. The number of paragraphs written
# is printed at the end, or alongside the traceback if anything fails.
with open('data/baidu-zhidao-paragraph.txt', 'wb') as out:
    written = 0
    try:
        for paragraph in Session.query(Paragraph):
            pieces = [paragraph.question.title, '\n']
            for reply in paragraph.reply:
                pieces.extend((reply.content, '\n'))
            pieces.append('\n')
            # Encode the whole record first so nothing is written if one
            # piece cannot be encoded.
            encoded = [piece.encode('utf-8') for piece in pieces]
            out.writelines(encoded)
            written += 1
        print(written)
    except:
        # Deliberately broad: report how far the export got, whatever failed.
        print('error, count %d' % written)
        traceback.print_exc()