Example No. 1
    def parse_question(self, content):
        """ 解析question页面
        """
        html = etree.HTML(content)

        question_tag = []
        question_title = question_text = follower_count = ''
        try:
            question_tag = [
                i.strip()
                for i in html.xpath('//a[@class="zm-item-tag"]/text()')
            ] or []
            question_title_xpath = html.xpath(
                '//h2[@class="zm-item-title"]/span/text()')
            question_text_xpath = html.xpath(
                '//div[@class="zm-editable-content"]/text()')
            follower_count_xpath = html.xpath(
                '//div[@class="zg-gray-normal"]/a/strong/text()')
            answer_count_xpath = html.xpath(
                '//h3[@id="zh-question-answer-num"]/text()')
            question_title = question_title_xpath[0] if question_title_xpath else ''
            question_text = question_text_xpath[0] if question_text_xpath else ''
            follower_count = follower_count_xpath[0] if follower_count_xpath else 0
            _answer_count = answer_count_xpath[0] if answer_count_xpath else ''
        except Exception as e:
            log.error(
                'question_parse_question: question_url={} except={}'.format(
                    self.question_url, str(e)))
Example No. 2
def crawl(url):
    """ 返回下载页面
    """
    session = cookie.get_cookie()
    try:
        resp = session.get(url, headers=headers.get_headers())
    except Exception as e:
        log.error('util_crawler: crawl {} except={}'.format(url, str(e)))
        return
Example No. 3
def insert_question_id(questions_id, topic_id):
    """ mysql question表存储question_id
    """
    in_sql = 'INSERT INTO question (question_id, topic_id) VALUES (%s, %s);'
    values = (questions_id, topic_id)
    try:
        MYSQLHANDLER.insert(in_sql, values)
    except Exception as e:
        log.error('main_main insert_question_id {}-{} except={}'.format(topic_id, questions_id, str(e)))
Example No. 4
def redis_check(table, table_id):
    """ 判断question或answer是否已存在
    """
    if table == 'question':
        return REDIS_CLI.sismember('zhihu_topic_question_id', str(table_id))
    elif table == 'answer':
        return REDIS_CLI.sismember('zhihu_topic_answer_id', str(table_id))
    else:
        log.error('main_redis_check illegal table')
        return False
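A minimal usage sketch (not part of the original repo): pairing redis_check with insert_question_id from Example No. 3 so a question is only stored once. The Redis set name and the sadd call mirror the ones used elsewhere in these examples; the ids below are placeholders.

# Hedged sketch: only insert a question that Redis has not seen yet.
# question_id mirrors the sample file name used in Example No. 5; topic_id is made up.
question_id = '38589246'
topic_id = '12345678'
if not redis_check('question', question_id):
    insert_question_id(question_id, topic_id)
    REDIS_CLI.sadd('zhihu_topic_question_id', str(question_id))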
Example No. 5
    def crawl_question(self):
        """ Crawl the question page from question_url
        """
        self.question_url = self.get_question_url()
        content = crawler.crawl(self.question_url)
        #with open('./data/questions/38589246.html', 'rb') as _r:
        #    content = _r.read()
        if not content:
            log.error('question_crawl_question: content is None')
            return
        self.webpage_save(content)
        return self.parse_question(content)
Example No. 6
def update_question_item(questions_id, q_item, topic_id):
    """ mysql question表存储question其他项目
    """
    up_sql = 'UPDATE question SET question_url=%s, question_title=%s, question_text=%s, follower_count=%s,\
        scan_count=%s, answer_count=%s, question_tag=%s WHERE question_id=%s AND topic_id=%s;'
    values = (q_item.get('question_url'), q_item.get('question_title'), q_item.get('question_text'),
              q_item.get('follower_count'), q_item.get('scan_count'), q_item.get('answer_count'),
              '|'.join(q_item.get('question_tag')), questions_id, topic_id)
    try:
        MYSQLHANDLER.insert(up_sql, values)
    except Exception as e:
        log.error('main_main update_question_item {}-{} except={}'.format(topic_id, questions_id, str(e)))
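For reference, a hedged sketch of the q_item dict this function expects. Only the key names are taken from the q_item.get(...) calls above; every value (and both ids) is a placeholder.

# Placeholder q_item; key names mirror the code above, values are invented.
q_item = {
    'question_url': 'https://www.zhihu.com/question/<id>',   # illustrative URL shape
    'question_title': 'some title',
    'question_text': 'some body text',
    'follower_count': 0,
    'scan_count': 0,
    'answer_count': 0,
    'question_tag': ['tag-a', 'tag-b'],   # joined with '|' before storage
}
update_question_item('<question_id>', q_item, '<topic_id>')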
Example No. 7
    def get_topic_max_page(self):
        """ Find the total number of pages under this topic from the entry URL
        """
        url = crawl_topic_url.format(self.topic_id)
        content = crawler.crawl(url)
        max_page = 0
        try:
            page_num = re.findall(r'\?page=(\d+)', content)
            if page_num:
                max_page = max(int(i) for i in page_num)
        except Exception as e:
            log.error('topic_topic: get_topic_max_page except={}'.format(e))
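To illustrate the page-count extraction, here is a small stand-alone run of the same regex on a made-up pagination fragment (the HTML below is invented for this sketch; the real topic page markup may differ).

import re

sample = '<a href="?page=2">2</a> ... <a href="?page=57">57</a>'
page_num = re.findall(r'\?page=(\d+)', sample)    # ['2', '57']
max_page = max(int(i) for i in page_num)          # 57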
Example No. 8
    def parse_answer(self, content_json):
        """ 解析answer数据
        """

        for d in content_json.get('data'):
            try:
                self.answer_id = d.get('id') or ''
                if not self.answer_id:
                    raise Exception
                self.author_name = d.get('author').get('name') or ''
                self.author_domain = d.get('author').get('url_token') or ''
                self.author_type = d.get('author').get('type') or ''
                self.author_headline = d.get('author').get('headline') or ''
                self.author_id = d.get('author').get('id') or ''
                self.content = d.get('content')
                self.voteup_count = d.get('voteup_count') or 0
                self.comment_count = d.get('comment_count') or 0
                self.answer_url = answer_url.format(self.question_id,
                                                    self.answer_id)

                _answer_updated_time = d.get('updated_time') or 0
                _answer_create_time = d.get('created_time') or 0
                if _answer_updated_time:
                    self.answer_updated_time = datetime.datetime.fromtimestamp(
                        int(_answer_updated_time)).strftime(
                            '%Y-%m-%d %H:%M:%S')
                if _answer_create_time:
                    self.answer_create_time = datetime.datetime.fromtimestamp(
                        int(_answer_create_time)).strftime('%Y-%m-%d %H:%M:%S')
                _answer = {
                    'question_id': self.question_id,
                    'answer_id': self.answer_id,
                    'answer_url': self.answer_url,
                    'author_name': self.author_name,
                    'author_domain': self.author_domain,
                    'author_type': self.author_type,
                    'author_headline': self.author_headline,
                    'author_id': self.author_id,
                    'content': self.content,
                    'answer_updated_time': self.answer_updated_time,
                    'answer_create_time': self.answer_create_time,
                    'voteup_count': self.voteup_count,
                    'comment_count': self.comment_count
                }
                self.answers.append(_answer)
            except Exception as e:
                log.error(
                    'answer_parse_anwser: question_id={} except={}'.format(
                        self.question_id, str(e)))
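A hedged sketch of the payload shape parse_answer walks: the field names come from the d.get(...) calls above, while the values (and any extra nesting the real API may use) are assumptions.

# Placeholder payload for parse_answer; only the key names are taken from the code.
content_json = {
    'data': [
        {
            'id': 111111,
            'author': {
                'name': 'someone',
                'url_token': 'someone',
                'type': 'people',        # assumed value
                'headline': '',
                'id': 'abcdef',
            },
            'content': '<p>answer html</p>',
            'voteup_count': 0,
            'comment_count': 0,
            'updated_time': 1500000000,  # unix timestamps
            'created_time': 1500000000,
        }
    ]
}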
Example No. 9
def crawl_question():
    """ question抓取
    """
    ## 获取topic
    topics = get_topic_mysql('crawled')
    for topic_id in topics:
        saved_questions_id = get_exists_question_id(topic_id, q_type='empty')
        for q_id in saved_questions_id:
            question_item = Question.from_question(q_id, topic_id).crawl_question()
            if not question_item:
                log.error('main_crawl_question: question_item is None question_id={}'.format(q_id))
                break
            update_question_item(q_id, question_item, topic_id)
            time.sleep(0.2)
Example No. 10
def get_exists_question_id(topic_id, q_type='all'):
    """ 查看mysql中已经保存的question_id
    """
    questions = list()
    if q_type == 'all':
        se_sql = 'SELECT question_id FROM question WHERE topic_id=%s;'
    elif q_type == 'normal':
        se_sql = 'SELECT question_id FROM question WHERE topic_id=%s AND question_title IS NOT NULL;'
    elif q_type == 'empty':
        se_sql = 'SELECT question_id FROM question WHERE topic_id=%s AND question_title IS NULL;'
    else:
        log.error('main_get_exists_question_id: qType = {}'.format(q_type))
        return questions
    values = (topic_id, )
    infos = MYSQLHANDLER.select(se_sql, values)
    for info in infos:
        questions.append(info.get('question_id'))
    return list(set(questions))
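Usage sketch (assuming a valid topic_id is in scope): the three q_type filters map directly to the branches above.

# Hedged usage sketch of the three q_type filters.
all_ids = get_exists_question_id(topic_id)                      # every stored question_id
parsed_ids = get_exists_question_id(topic_id, q_type='normal')  # question_title already filled
pending_ids = get_exists_question_id(topic_id, q_type='empty')  # question_title still NULL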
Example No. 11
def insert_answer_item(a_items):
    """ mysql answer表存储answer项目
    """
    in_sql = 'INSERT INTO answer (answer_id, answer_url, question_id, author_name, author_domain, author_type, author_headline,\
            author_id, content, answer_updated_time, answer_create_time, voteup_count, comment_count)\
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=NOW();'
    for a_item in a_items:
        ## Check whether the answer already exists
        if redis_check('answer', a_item.get('answer_id')):
            continue
        values = (a_item.get('answer_id'), a_item.get('answer_url'), a_item.get('question_id'), a_item.get('author_name'),
                  a_item.get('author_domain'), a_item.get('author_type'), a_item.get('author_headline'),
                  a_item.get('author_id'), a_item.get('content'), a_item.get('answer_updated_time'), a_item.get('answer_create_time'),
                  a_item.get('voteup_count'), a_item.get('comment_count'))
        try:
            MYSQLHANDLER.insert(in_sql, values)
        except Exception as e:
            log.error('main_main insert_answer_item {}-{} except={}'.format(a_item.get('question_id'), a_item.get('answer_id'), str(e)))
        REDIS_CLI.sadd('zhihu_topic_answer_id', str(a_item.get('answer_id')))
Example No. 12
    def call_questionAPI(self):
        """ 根据questionAPI获取answer数据
        """
        while True:
            url_api = self.get_questionAPI_url()
            #print url_api

            content = crawler.crawl(url_api)
            time.sleep(0.2)
            if not content:
                log.error(
                    'answer_call_questionAPI: url_api={}'.format(url_api))
                break
            try:
                content_json = json.loads(content)
            except Exception as e:
                log.error(
                    'answer_call_questionAPI: except={} url_api={}'.format(
                        str(e), url_api))
                break
            if not content_json or not content_json.get('data') or len(
                    content_json.get('data')) == 0:
                break
            self.parse_answer(content_json)
Example No. 13
import os
import sys
import requests

reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append(os.path.abspath('..'))

from util._log import log
from util import common
from util import headers
from util import cookie
""" 本程序用以下载页面
"""


def crawl(url):
    """ 返回下载页面
    """
    session = cookie.get_cookie()
    try:
        resp = session.get(url, headers=headers.get_headers())
    except Exception as e:
        log.error('util_crawler: crawl {} except={}'.format(url, str(e)))
        return
    if resp.status_code != 200:
        log.error('util_crawler: resp.status_code={}'.format(resp.status_code))
        return
    resp.encoding = 'utf-8'
    return resp.text
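A short usage sketch, assuming the cookie and headers helpers are configured: crawl() returns the decoded page text, or None on a request error or non-200 status. The URL below is illustrative only.

# Hedged usage sketch; the question URL is made up for illustration.
content = crawl('https://www.zhihu.com/question/<question_id>')
if content is None:
    log.error('demo: page fetch failed')
else:
    print len(content)   # Python 2 print statement, matching the rest of the codebase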