import re

# NOTE: get_page, to_unicode, write_csv, is_visited, logger, NUM_RE, the
# *_CSV path constants, and the Session/Question/Paragraph/Reply ORM
# objects are defined elsewhere in the project. The extract methods below
# come from different extractor classes; their class definitions are
# omitted here.

def extract(self, target):
    """Queue the URL of every unsolved question whose answer count is
    greater than zero.

    :param target: URL of the question-list page to start crawling from
    """
    page = self.get_page(target)
    while not self.exit_signal.isSet() and page is not None:
        for answer_num_div in page.find_all(
                'div', 'f-12 f-light question-answer-num'):
            # .string is None when the div has child tags; guard it
            matched_result = re.findall(NUM_RE, answer_num_div.string or '')
            answer_num = int(matched_result[0]) if matched_result else 0
            if answer_num == 0:
                continue
            title_container_div = answer_num_div.find_previous_sibling(
                'div', {'class': 'title-container'})
            question_anchor = title_container_div.find(
                'a', {'class': 'question-title'})
            question_url = to_unicode(question_anchor['href'])
            logger.info("start to put '%s' into queue(%d), %d answers",
                        question_url, self.queue.qsize(), answer_num)
            self.queue.put(question_url)
            logger.info("finished putting '%s' into queue(%d)",
                        question_url, self.queue.qsize())
        next_page_link = get_next_page_link(page)
        if next_page_link is not None:
            page = self.get_page(next_page_link)
        else:
            return
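
# Sub-category variant of extract (from another extractor class): scrapes
# the name and URL of every sub category on a category page and appends
# them to SUB_CATEGORY_CSV.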
def extract(self, target):
    page = self.get_page(target)
    if page is None:
        return
    logger.info('start to extract sub category in %s', target)
    category_list = []
    for category_item in page.find_all('li', 'category-item'):
        for anchor in category_item.find_all('a'):
            name = to_unicode(anchor.string)
            if name is None:
                # anchors with nested tags have no .string; join their
                # text fragments instead
                name = to_unicode(anchor.strings)
            # drop the trailing count after the no-break space
            name = re.sub(r'\xa0.+', '', name)
            url = to_unicode(anchor['href'])
            category_list.append((name, url))
    write_csv(SUB_CATEGORY_CSV, 'ab', category_list)
    logger.info('finished extracting sub category in %s', target)
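
# get_title covers the two layouts of a question title: plain text inside
# <span class="ask-title">, or text sitting after an inline
# <a class="g-zhima-tag"> badge within that span.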
def get_title(page):
    title = None
    title_span = page.find('span', 'ask-title')

    if title_span is not None:
        title = title_span.string
        if title is None:
            anchor = title_span.find('a', 'g-zhima-tag')
            if anchor is not None:
                title = anchor.next_sibling
    return to_unicode(title)
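
# extract_top_category seeds the whole crawl from the /browse index; note
# it opens TOP_CATEGORY_CSV with 'wb' (truncate), whereas sub categories
# are appended with 'ab'.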
def extract_top_category(self):
    target = '/browse'
    page = self.get_page(target)
    if page is None:
        return
    logger.info('start to extract top category in %s', target)
    category_url_list = []
    for category_item in page.find_all('li', 'category-item'):
        anchor = category_item.a
        if anchor is None:
            continue
        category_name = to_unicode(anchor.string)
        category_url = to_unicode(anchor['href'])
        if category_name and category_url:
            category_url_list.append((category_name, category_url))
    if not category_url_list:
        logger.error('no category found')
        return
    write_csv(TOP_CATEGORY_CSV, 'wb', category_url_list)
    logger.info("finished extracting top category into '%s'",
                TOP_CATEGORY_CSV)
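
# List-page variant of extract: queues every question link (anchor class
# 'ti t-ie6') without filtering on answer count.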
def extract(self, target):
    page = self.get_page(target)
    while not self.exit_signal.isSet() and page:
        for question_anchor in page.find_all('a', 'ti t-ie6'):
            question_url = to_unicode(question_anchor['href'])
            logger.info("start to put '%s' into queue(%d)",
                        question_url, self.queue.qsize())
            self.queue.put(question_url)
            logger.info("finished putting '%s' into queue(%d)",
                        question_url, self.queue.qsize())
        next_page_link = get_next_page_link(page)
        if next_page_link is not None:
            page = self.get_page(next_page_link)
        else:
            return
def get_next_page_link(page):
    """Return the href of the 'next page' anchor, or None."""
    if page is not None:
        next_page_anchor = page.find('a', 'pager-next')
        if next_page_anchor:
            return to_unicode(next_page_anchor['href'])
    return None
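
# Question-page extractor: the consumer side of the queue. It inserts the
# Question row, then walks every page of replies, saving each
# question/answer exchange as a Paragraph whose Reply rows are flagged
# 1 for the answerer and 0 for the asker.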
def extract(self, target):
    logger.info('check whether visited')
    matched_result = re.findall(r'/(\d+)\.html', target)
    if not matched_result:
        logger.error('invalid question page url %s', target)
        return
    question_id = matched_result[0]
    if is_visited(question_id):
        logger.info('%s is visited, skip', question_id)
        return
    page = self.get_page(target, delay=True)
    if page is None:
        logger.info('page is none, skip')
        return
    # save question
    anchor = page.find('a', {'alog-alias': 'qb-class-info'})
    if anchor is None:
        # '百度--您的访问出错了' is Baidu's "your visit ran into an error"
        # page, served when the crawler's access is blocked
        if page.find('title', text=u'百度--您的访问出错了') is None:
            logger.error('invalid question page %s', target)
        else:
            logger.error('auth page, set exit signal')
            self.exit_signal.set()
        return
    category_url = to_unicode(anchor['href'])
    category_id = re.findall(r'/(\d+)', category_url)[0]
    title = get_title(page)
    if title is None:
        logger.error('fail to get title in %s', target)
        return
    question = Question(question_id, category_id, title)
    Session.add(question)
    logger.info('start to insert %s', question)
    try:
        Session.commit()
    except Exception:
        logger.error('fail to insert %s, rollback', question,
                     exc_info=True)
        Session.rollback()
        return
    logger.info('finished inserting question')
    while not self.exit_signal.isSet() and page:
        for line_content_div in page.find_all('div', 'line content'):
            # answer only, no follow-up exchange: skip
            if line_content_div.find('dt', 'ask f-12 grid') is None:
                continue
            # generate paragraph
            paragraph = Paragraph(question_id)
            # generate replies: the main answer first (aContent), then the
            # follow-up exchange (aRA = answerer, qRA = asker)
            a_content = line_content_div.find('pre', {'accuse': 'aContent'})
            if a_content is None:
                logger.error('can not find aContent, structure changed')
                break
            reply = to_unicode(a_content.strings)
            paragraph.replies.append(Reply(1, reply))
            for pre in line_content_div.find_all('pre'):
                pre_accuse = pre.get('accuse', 'no')
                if pre_accuse == 'aRA':
                    reply = to_unicode(pre.strings)
                    paragraph.replies.append(Reply(1, reply))
                elif pre_accuse == 'qRA':
                    reply = to_unicode(pre.strings)
                    paragraph.replies.append(Reply(0, reply))
            Session.add(paragraph)
            logger.info('start to insert paragraph(%d replies)',
                        len(paragraph.replies))
            try:
                Session.commit()
            except Exception:
                logger.error('fail to insert %s, rollback', paragraph,
                             exc_info=True)
                Session.rollback()
            logger.info('finished inserting paragraph')

        next_page_link = get_next_page_link(page)
        if next_page_link is None:
            break
        page = self.get_page(next_page_link, delay=True)
    logger.info('finished extracting paragraph in %s', target)
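
# A minimal wiring sketch of the producer/consumer protocol the extractors
# above assume: a shared Queue plus a threading.Event exit signal. This is
# illustrative only; the real extractor classes and entry point live
# elsewhere in the project, and the sample URL is hypothetical.
import threading
from Queue import Queue, Empty  # Python 2, matching the 'wb'/'ab' csv modes

def _demo_drain(queue, exit_signal):
    # drain question URLs until the queue empties or an extractor signals
    # exit (e.g. after hitting Baidu's error page)
    while not exit_signal.isSet():
        try:
            url = queue.get(timeout=1)
        except Empty:
            break
        print(url)  # a real worker would call question_extractor.extract(url)
        queue.task_done()

if __name__ == '__main__':
    exit_signal = threading.Event()
    queue = Queue()
    queue.put('/question/1987535.html')  # hypothetical question URL
    _demo_drain(queue, exit_signal)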