Example #1
 def crawl_topic_page(self, max_page):
     """ 分页面下载
     """
     if not max_page:
         url = crawl_topic_url.format(self.topic_id)
         content = crawler.crawl(url)
         if not content:
             return
         file_name = '{}/data/topics/{}/top-answers_page_1.html'.format(
             os.path.abspath('.'), self.topic_id)
         self.webpage_save(file_name, content)
         self.extract_question_id(content)
     else:
         for i in range(1, max_page + 1):
             ## Crawl all answer pages: the volume is large; the "Life" topic, for example, has nearly 100,000 answers, and most of those questions have few followers
             #file_name = '{}/data/topics/{}/page_{}.html'.format(os.path.abspath('.'), self.topic_id, i)
             ## Crawl only the top (featured) answers: at most 50 pages of 20 answers each, i.e. at most 1,000 top answers; the questions they belong to may repeat
             file_name = '{}/data/topics/{}/top-answers_page_{}.html'.format(
                 os.path.abspath('.'), self.topic_id, i)
             if os.path.exists(file_name):
                 with open(file_name, 'rb') as _r:
                     content = _r.read()
             else:
                 url = page_topic_url.format(self.topic_id, i)
                 content = crawler.crawl(url)
                 time.sleep(0.5)
                 if not content:
                     continue
                 self.webpage_save(file_name, content)
             self.extract_question_id(content)
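The loop above uses a cache-or-fetch pattern: serve a page from disk when it has already been saved, otherwise download it, throttle, and save it. A minimal standalone sketch of the same pattern, using requests in place of the crawler object; all names here are illustrative, not the author's API:

    import os
    import time
    import requests

    def fetch_cached(url, cache_path, delay=0.5):
        """Return page content from a local cache, downloading and saving it on a miss."""
        if os.path.exists(cache_path):
            with open(cache_path, 'rb') as f:
                return f.read()
        resp = requests.get(url, timeout=10)
        time.sleep(delay)  # throttle only uncached requests, as the example does
        if resp.status_code != 200:
            return None
        os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
        with open(cache_path, 'wb') as f:
            f.write(resp.content)
        return resp.content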
Example #2
 def crawl_topic_page(self, max_page):
     """ 分页面下载
     """
     if not max_page:
         url = crawl_topic_url.format(self.topic_id)
         content = crawler.crawl(url)
         if not content:
             return
         self.webpage_save(1, content)
         self.extract_question_id(content)
     else:
         for i in range(1, max_page + 1):
             url = page_topic_url.format(self.topic_id, i)
             content = crawler.crawl(url)
             if not content:
                 continue
             self.webpage_save(i, content)
             self.extract_question_id(content)
             time.sleep(0.5)
Example #3
 def crawl_question(self):
     """ 根据question_url抓取question页面
     """
     self.question_url = self.get_question_url()
     content = crawler.crawl(self.question_url)
     #with open('./data/questions/38589246.html', 'rb') as _r:
     #    content = _r.read()
     if not content:
         return
     self.webpage_save(content)
     return self.parse_question(content)
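A minimal sketch of the same fetch-then-parse flow, with requests and BeautifulSoup standing in for the crawler object and parse_question; the CSS selector is a guess for illustration, not taken from the source:

    import requests
    from bs4 import BeautifulSoup

    def crawl_question(question_url):
        """Fetch a question page and return its title text, or None on failure."""
        resp = requests.get(question_url, timeout=10)
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.content, 'html.parser')
        node = soup.select_one('h1.QuestionHeader-title')  # hypothetical selector
        return node.get_text(strip=True) if node else None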
Example #4
 def get_topic_max_page(self):
     """ 根据入口链接找出此topic下页面总数
     """
     url = crawl_topic_url.format(self.topic_id)
     content = crawler.crawl(url)
     max_page = 0
     try:
         page_num = re.findall(r'\?page=(\d+)', content)
         if page_num:
             max_page = max(int(i) for i in page_num)
     except Exception as e:
         log.error('topic_topic: get_topic_max_page except={}'.format(e))
     return max_page
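The page-number extraction is easy to verify in isolation. A self-contained demo on a fabricated pagination fragment (the HTML below is made up for illustration):

    import re

    # Fabricated pagination links as they might appear in the topic page HTML.
    sample = '<a href="?page=2">2</a> <a href="?page=3">3</a> <a href="?page=50">50</a>'
    nums = re.findall(r'\?page=(\d+)', sample)
    max_page = max(int(n) for n in nums) if nums else 0
    print(max_page)  # -> 50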
Example #5
    def call_questionAPI(self):
        """ 根据questionAPI获取answer数据
        """
        while True:
            url_api = self.get_questionAPI_url()
            #print url_api

            content = crawler.crawl(url_api)
            time.sleep(0.2)
            if not content:
                log.error(
                    'answer_call_questionAPI: url_api={}'.format(url_api))
                break
            try:
                content_json = json.loads(content)
            except Exception as e:
                log.error(
                    'answer_call_questionAPI: except={} url_api={}'.format(
                        str(e), url_api))
                break  # content_json would be unbound below; stop on bad JSON
            if not content_json or not content_json.get('data'):
                break
            self.parse_answer(content_json)
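The loop above is a standard API pager: request, throttle, parse the JSON, and stop as soon as 'data' comes back empty. A generator sketch of the same shape, assuming a hypothetical offset/limit endpoint rather than the actual question API used here:

    import time
    import requests

    def iter_api_pages(api_url, page_size=20):
        """Yield parsed JSON pages until the endpoint returns an empty 'data' list."""
        offset = 0
        while True:
            resp = requests.get(api_url,
                                params={'limit': page_size, 'offset': offset},
                                timeout=10)
            time.sleep(0.2)  # throttle between calls, as the example does
            if resp.status_code != 200:
                break
            try:
                page = resp.json()
            except ValueError:
                break  # malformed JSON: stop rather than loop forever
            if not page.get('data'):
                break
            yield page
            offset += page_size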