def is_visited(question_id):
    """Tell whether a question has already been stored.

    :param question_id: id of the question to look up; may be None
    :return: True when a Question row with this id exists. True is also
        returned when question_id is None or when the lookup fails, so the
        default answer is "visited".
    """
    if question_id is None:
        return True
    try:
        query = Session.query(Question).filter_by(question_id=question_id)
        return query.count() != 0
    except Exception:
        # Only ordinary errors fall back to "visited"; KeyboardInterrupt and
        # SystemExit must still be able to stop the process.
        logger.error('fail to query question_id %s', question_id,
                     exc_info=True)
        return True
def save_analyzed_result(md5_string, result_json):
    """Store an analysis result as an LtpResult row.

    The result is serialized with ``json.dumps(..., ensure_ascii=False)``
    before it is stored. A failing commit is rolled back and logged; the
    exception is not propagated to the caller.

    :param md5_string: first argument passed to LtpResult, the key of the row
    :param result_json: JSON-serializable analysis result
    :return: None
    """
    ltp_result = LtpResult(md5_string,
                           json.dumps(result_json, ensure_ascii=False))
    Session.add(ltp_result)
    logger.info('start to insert ltp result, md5=%s', md5_string)
    try:
        Session.commit()
    except Exception:
        Session.rollback()
        logger.error('fail to insert', exc_info=True)
    else:
        # Report success only when the commit really went through.
        logger.info('finished inserting ltp result')
def get_analyzed_result(question_text):
    """Return the analysis of a question text, using stored results first.

    The text is keyed by ``md5(question_text)``. When an LtpResult row with
    that key exists it is reused; otherwise ``analyze`` is invoked and its
    result is stored through ``save_analyzed_result``.

    :param question_text: text to analyze; may be None
    :return: AnalyzedSentence, or None when question_text is None
    :raise RuntimeError: when ``analyze`` fails
    """
    if question_text is None:
        return None
    md5_string = md5(question_text)
    ltp_result = Session.query(LtpResult).filter_by(md5=md5_string).first()
    if ltp_result is not None:
        # NOTE(review): this path hands over the stored ``json_text`` while
        # the path below hands over the raw result of ``analyze`` --
        # presumably AnalyzedSentence accepts both forms; confirm.
        return AnalyzedSentence(md5_string, ltp_result.json_text)

    try:
        result_json = analyze(question_text)
    except RuntimeError:
        logger.error('fail to invoke ltp api, text=%s', question_text,
                     exc_info=True)
        # Re-raise the original error so its message and traceback survive.
        raise

    save_analyzed_result(md5_string, result_json)
    return AnalyzedSentence(md5_string, result_json)
def k_fold_cross_dataset(k, num):
    """Generate k-fold cross test set and train set.

    The data files are only generated when at least one of them is missing;
    otherwise the existing files are reused untouched. Folds are numbered
    from 0 to k - 1.

    Example:
        In: k_fold_cross_dataset(2, 10)
        Out:
            [
                {
                    'test_text': 'data/2-fold-cross-10-test-text-0.txt',
                    'test_label': 'data/2-fold-cross-10-test-label-0.txt',
                    'train_text': 'data/2-fold-cross-10-train-text-0.txt',
                    'train_label': 'data/2-fold-cross-10-train-label-0.txt',
                },
                {
                    'test_text': 'data/2-fold-cross-10-test-text-1.txt',
                    'test_label': 'data/2-fold-cross-10-test-label-1.txt',
                    'train_text': 'data/2-fold-cross-10-train-text-1.txt',
                    'train_label': 'data/2-fold-cross-10-train-label-1.txt',
                }
            ]

    :param k: int, number of folds
    :param num: int, number of paragraphs distributed over the folds
    :return: list of dicts, one per fold, mapping 'test_text', 'test_label',
        'train_text' and 'train_label' to file names
    :raise RuntimeError: when the query does not return exactly num
        paragraphs
    """
    prefix = 'data/{k}-fold-cross-{num}'.format(k=k, num=num)

    def file_name(kind, fold_index):
        return '{prefix}-{kind}-{i}.txt'.format(
            prefix=prefix, kind=kind, i=fold_index)

    file_names = []
    for i in range(k):
        file_names.append({
            'test_text': file_name('test-text', i),
            'test_label': file_name('test-label', i),
            'train_text': file_name('train-text', i),
            'train_label': file_name('train-label', i)})

    # all() stops at the first missing file.
    all_exist = all(os.path.isfile(path)
                    for names in file_names
                    for path in names.values())
    if all_exist:
        return file_names

    filtered_paragraphs = Session.query(FilteredParagraph).limit(num).all()
    if len(filtered_paragraphs) != num:
        raise RuntimeError(
            'expected {0} filtered paragraphs, got {1}'.format(
                num, len(filtered_paragraphs)))
    random.shuffle(filtered_paragraphs)

    # Deal the shuffled paragraphs round-robin into k folds.
    folds = [[] for _ in range(k)]
    for i, filtered in enumerate(filtered_paragraphs):
        folds[i % k].append(filtered.paragraph)

    for i, names in enumerate(file_names):
        test_set = folds[i]
        # The train set of fold i is every other fold, in fold order.
        train_set = []
        for j, fold in enumerate(folds):
            if j != i:
                train_set.extend(fold)
        # generate test set
        generate_dataset(test_set, names['test_text'], names['test_label'])
        # generate train set
        generate_dataset(train_set, names['train_text'], names['train_label'])
    return file_names
#!/usr/bin/env python
# coding: utf-8
__author__ = 'wilfredwang'
from data_access import Session
from data_access import Paragraph
import traceback


# Dump every paragraph to a text file: the question title on one line, each
# reply on its own line, then an empty line as separator.
with open('data/baidu-zhidao-paragraph.txt', 'wb') as f:
    count = 0
    try:
        for paragraph in Session.query(Paragraph):
            chunks = [paragraph.question.title, '\n']
            # NOTE(review): reads ``paragraph.reply``; other code of this
            # project uses ``paragraph.replies`` -- confirm the attribute
            # name against the Paragraph model.
            for reply in paragraph.reply:
                chunks.extend((reply.content, '\n'))
            chunks.append('\n')
            # Encode everything first, so nothing is written for a paragraph
            # whose text cannot be encoded.
            encoded = [chunk.encode('utf-8') for chunk in chunks]
            f.writelines(encoded)
            count += 1
        print(count)
    except:
        # Whatever aborted the dump, report how many paragraphs were written.
        print('error, count %d' % count)
        traceback.print_exc()
    def extract(self, target):
        """Extract the question and its paragraphs from a question page.

        The question id is parsed from ``target``; a question that
        ``is_visited`` reports as visited is skipped. Otherwise the page is
        fetched and the question is inserted. Then every 'line content' div
        that has an 'ask f-12 grid' dt element is stored as a Paragraph with
        its replies. Next-page links are followed until no page is returned
        or the exit signal is set.

        Every failure is logged and ends the extraction early; nothing is
        raised for malformed urls or pages. When the error page of the site
        is detected, ``self.exit_signal`` is set.

        :param target: url of the question page, expected to contain
            ``/<digits>.html``
        :return: None
        """
        logger.info('check whether visited')
        # The dot is escaped so that only a real ".html" suffix matches.
        matched_result = re.findall(r'/(\d+)\.html', target)
        if not matched_result:
            logger.error('invalid question page url %s', target)
            return
        question_id = matched_result[0]
        if is_visited(question_id):
            logger.info('%s is visited, skip', question_id)
            return
        page = self.get_page(target, delay=True)
        if page is None:
            logger.info('page is none, skip')
            return
        # save question
        anchor = page.find('a', {'alog-alias': 'qb-class-info'})
        if anchor is None:
            if page.find('title', text=u'百度--您的访问出错了') is None:
                logger.error('invalid question page %s', target)
            else:
                logger.error('auth page, set exit signal')
                self.exit_signal.set()
            return
        category_url = to_unicode(anchor['href'])
        category_ids = re.findall(r'/(\d+)', category_url)
        if not category_ids:
            # Without this guard a link without digits raised IndexError.
            logger.error('fail to get category id in %s', target)
            return
        category_id = category_ids[0]
        title = get_title(page)
        if title is None:
            logger.error('fail to get title in %s', target)
            return
        question = Question(question_id, category_id, title)
        Session.add(question)
        logger.info('start to insert %s', question)
        try:
            Session.commit()
        except Exception:
            logger.error('fail to insert %s, rollback', question, exc_info=True)
            Session.rollback()
            return
        logger.info('finished inserting question')
        while not self.exit_signal.isSet() and page:
            for line_content_div in page.find_all('div', 'line content'):
                # answer only, skip
                if line_content_div.find('dt', 'ask f-12 grid') is None:
                    continue
                # generate paragraph
                paragraph = Paragraph(question_id)
                # generate reply
                a_content = line_content_div.find('pre', {'accuse': 'aContent'})
                if a_content is None:
                    # Gives up on the rest of this page only; the next page
                    # is still fetched below.
                    logger.error('can not find aContent, structure changed')
                    break
                reply = to_unicode(a_content.strings)
                paragraph.replies.append(Reply(1, reply))
                # NOTE(review): 'aRA' / 'qRA' are presumably follow-up replies
                # of the answerer / the asker, stored with 1 / 0 as first
                # Reply argument -- confirm against the Reply model.
                for pre in line_content_div.find_all('pre'):
                    pre_accuse = pre.get('accuse', 'no')
                    if pre_accuse == 'aRA':
                        reply = to_unicode(pre.strings)
                        paragraph.replies.append(Reply(1, reply))
                    elif pre_accuse == 'qRA':
                        reply = to_unicode(pre.strings)
                        paragraph.replies.append(Reply(0, reply))
                Session.add(paragraph)
                logger.info('start to insert paragraph(%d replies)',
                            len(paragraph.replies))
                try:
                    Session.commit()
                except Exception:
                    logger.error('fail to insert %s, rollback', paragraph,
                                 exc_info=True)
                    Session.rollback()
                else:
                    # Report success only when the commit went through.
                    logger.info('finished inserting paragraph')

            next_page_link = get_next_page_link(page)
            page = self.get_page(next_page_link, delay=True)
        logger.info('finished extracting paragraph in %s', target)
 def cleanup(self):
     """Release the session by calling ``Session.remove()``.

     NOTE(review): presumably ``Session`` is a SQLAlchemy scoped_session, in
     which case this discards the session bound to the current scope --
     confirm against the data access module.
     """
     Session.remove()