Ejemplo n.º 1
0
def save_analyzed_result(md5_string, result_json):
    """Store one analyzed result in the database.

    ``result_json`` is serialized with ``json.dumps(..., ensure_ascii=False)``
    (non-ASCII characters are written as-is rather than ``\\uXXXX`` escaped),
    wrapped in an ``LtpResult`` together with ``md5_string`` and committed
    through ``Session``.

    A failing commit is rolled back and logged with its traceback; it is not
    re-raised, so the function returns ``None`` in both the success and the
    failure case.  Errors raised while serializing or while adding the object
    to the session still propagate to the caller.

    :param md5_string: key stored alongside the result (presumably the MD5
        digest of the analyzed text -- confirm against the caller).
    :param result_json: JSON-serializable object holding the analysis result.
    """
    serialized = json.dumps(result_json, ensure_ascii=False)
    ltp_result = LtpResult(md5_string, serialized)
    Session.add(ltp_result)
    logger.info('start to insert ltp result, md5=%s', md5_string)
    try:
        Session.commit()
    except Exception:
        Session.rollback()
        logger.error('fail to insert', exc_info=True)
    else:
        # Report success only when the commit actually went through, so the
        # log does not claim a finished insert right after a rollback.
        logger.info('finished inserting ltp result')
Ejemplo n.º 2
0
    def extract(self, target):
        """Extract a question and its reply paragraphs from a question page.

        The question id is the number in the ``/<digits>.html`` part of
        ``target``.  The method returns early, storing nothing, when:

        * ``target`` contains no such id,
        * ``is_visited`` reports the question id as already visited,
        * ``self.get_page`` returns ``None``,
        * the category anchor, the category id or the title cannot be found,
        * committing the ``Question`` fails (the session is rolled back).

        If the category anchor is missing and the page carries the
        access-error title, ``self.exit_signal`` is set before returning.

        Once the question is committed, the page and its follow-up pages
        (``get_next_page_link``) are scanned.  For each matching content
        block one ``Paragraph`` with its ``Reply`` objects is committed; a
        failing paragraph commit is rolled back and logged, and scanning
        continues.  The scan stops when ``self.exit_signal`` is set, when a
        page is falsy, or when there is no next page link.

        :param target: URL of the question page.
        :return: ``None`` in every case.
        """
        logger.info('check whether visited')
        # The dot is escaped so that only a literal ".html" is accepted.
        matched_result = re.findall(r'/(\d+)\.html', target)
        if not matched_result:
            logger.error('invalid question page url %s', target)
            return
        question_id = matched_result[0]
        if is_visited(question_id):
            logger.info('%s is visited, skip', question_id)
            return
        page = self.get_page(target, delay=True)
        if page is None:
            logger.info('page is none, skip')
            return

        # save question
        anchor = page.find('a', {'alog-alias': 'qb-class-info'})
        if anchor is None:
            if page.find('title', text=u'百度--您的访问出错了') is None:
                logger.error('invalid question page %s', target)
            else:
                # The access-error page was served instead of the question:
                # signal the exit event rather than keep requesting pages.
                logger.error('auth page, set exit signal')
                self.exit_signal.set()
            return
        category_url = to_unicode(anchor['href'])
        category_ids = re.findall(r'/(\d+)', category_url)
        if not category_ids:
            # Guard against an href without a numeric part instead of
            # failing with IndexError.
            logger.error('fail to get category id from %s in %s',
                         category_url, target)
            return
        category_id = category_ids[0]
        title = get_title(page)
        if title is None:
            logger.error('fail to get title in %s', target)
            return
        question = Question(question_id, category_id, title)
        Session.add(question)
        logger.info('start to insert %s', question)
        try:
            Session.commit()
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            logger.error('fail to insert %s, rollback', question, exc_info=True)
            Session.rollback()
            return
        logger.info('finished inserting question')

        while not self.exit_signal.isSet() and page:
            for line_content_div in page.find_all('div', 'line content'):
                # answer only (no 'ask' element in this block), skip
                if line_content_div.find('dt', 'ask f-12 grid') is None:
                    continue
                # generate paragraph
                paragraph = Paragraph(question_id)
                # generate reply
                a_content = line_content_div.find('pre', {'accuse': 'aContent'})
                if a_content is None:
                    logger.error('can not find aContent, structure changed')
                    # Give up on the rest of this page; the next page is
                    # still attempted below.
                    break
                # First Reply argument: 1 for 'aContent'/'aRA' text, 0 for
                # 'qRA' text (presumably answerer vs. asker -- confirm
                # against the Reply model).
                reply = to_unicode(a_content.strings)
                paragraph.replies.append(Reply(1, reply))
                for pre in line_content_div.find_all('pre'):
                    pre_accuse = pre.get('accuse', 'no')
                    if pre_accuse == 'aRA':
                        reply = to_unicode(pre.strings)
                        paragraph.replies.append(Reply(1, reply))
                    elif pre_accuse == 'qRA':
                        reply = to_unicode(pre.strings)
                        paragraph.replies.append(Reply(0, reply))
                Session.add(paragraph)
                logger.info('start to insert paragraph(%d replies)',
                            len(paragraph.replies))
                try:
                    Session.commit()
                except Exception:
                    logger.error('fail to insert %s, rollback', paragraph,
                                 exc_info=True)
                    Session.rollback()
                else:
                    # Only claim success when the commit went through.
                    logger.info('finished inserting paragraph')

            next_page_link = get_next_page_link(page)
            if not next_page_link:
                # No further page to follow; do not request an empty link.
                break
            page = self.get_page(next_page_link, delay=True)
        logger.info('finished extracting paragraph in %s', target)