Example #1
0
    def run(self):
        """Crawl every list page and, for each post on it, every detail page.

        Iterates list pages from self.start_page up to the total page count
        parsed out of each response, saving every post's replies page by page.
        NOTE(review): assumes self.page is initialized larger than
        self.start_page so the first iteration runs — confirm in __init__.
        """
        while self.start_page < self.page:
            # Fetch the current list page (parse_url wraps requests).
            response = parse_url(self.url, self.headers)
            # Strip the encoding declaration — presumably so the downstream
            # parser accepts the text; TODO confirm. Note the unescaped '.'
            # makes it a regex wildcard here.
            response = re.sub('encoding="UTF-8.36kr"', '', response)
            # Parse titles, detail URLs, metadata, pagination links, total pages.
            title_list, content_url_list, information_list, next_page_url_list, page_max = self.parse_response(response)
            # Refresh the total page count from this response.
            self.page = int(page_max[0])
            for i in range(len(title_list)):
                # First detail page (offset 0) of this post.
                tie_response = self.get_tie_content(content_url_list[i], 0)
                # Extract content, author, photos, metadata and reply info.
                tie_content, author, photo_url, information, reply_count, tie_next_page_url_list, tie_page_max = self.parse_tie_content(
                    tie_response)
                # Persist page one of the post.
                self.save_tie_reply(title_list[i], tie_content, author, photo_url, information, reply_count)
                print('帖子第1页以爬完')
                p = 1
                pp = 2
                # Walk the remaining detail pages; p is the fetch offset,
                # pp is the human-readable page number being printed.
                while p < tie_page_max:
                    tie_response = self.get_tie_content(content_url_list[i], p)
                    tie_content, author, photo_url, information, reply_count, tie_next_page_url_list, page_max = self.parse_tie_content(
                        tie_response)
                    print('帖子第%d页以爬完' % pp)
                    pp += 1
                    p += 1
                    self.save_tie_reply(title_list[i], tie_content, author, photo_url, information, reply_count)

            # Rebuild the list-page URL: fixed base plus the parsed
            # next-page suffix from this response.
            self.url = 'http://jump.bdimg.com/mo/q----,sz@320_240-1-3---/m?kw=%E6%9F%AF%E5%8D%97&amp;lp=7202'
            self.url += next_page_url_list[0]
            print('第%d页爬完' % self.start_page)
            self.start_page += 1
Example #2
0
 def run(self):
     """Poll self.url forever: fetch, parse, and download each batch of hot items."""
     while True:
         # parse_url is a thin wrapper around requests: url + headers -> response body
         raw = parse_url(self.url, self.headers)
         parsed = self.parse_response(raw)
         self.download(parsed)
         print('已下载15条热点')
Example #3
0
 def get_tie_content(self, content_url, i):
     """Fetch page *i* (0-based) of a post's detail view.

     Args:
         content_url: path fragment of the post, appended to self.pares_url.
         i: 0-based page index; pages are offset in steps of 30 entries.

     Returns:
         Response text with the encoding declaration blanked out.
     """
     # The site paginates detail pages in steps of 30 records.
     offset = int(i) * 30
     # Build the paged detail-page URL.
     url = self.pares_url + content_url + '&pn=' + str(offset)
     # parse_url wraps requests and returns the response body.
     tie_response = parse_url(url, self.headers)
     # Blank the encoding declaration. Bug fix: the original pattern's
     # unescaped '.' was a regex wildcard; escape it so only the literal
     # string matches.
     tie_response = re.sub(r'encoding="UTF-8\.36kr"', ' ', tie_response)
     return tie_response
Example #4
0
    def run(self):
        """Crawl page one, then follow pagination until the parsed total is reached."""
        # parse_url is a requests wrapper: url + headers -> response text
        body = parse_url(self.url, self.headers)
        # parsed data plus the total page count for pagination
        data, total_page = self.parse_content(body)
        # persist the first page
        self.download(data)

        print('第1页爬取完毕')

        current = 1
        # fetch the remaining pages one by one
        while current < int(total_page):
            self.get_url(current)
            body = parse_url(self.url, self.headers)
            data, total_page = self.parse_content(body)
            self.download(data)
            current += 1
            print('第%d页爬取完毕' % current)
    def run(self):
        """Crawl every category: fetch page one, then paginate in steps of 18 items.

        For each classify returned by get_classify, points self.url at the
        category's listing via self.get_url, downloads page one, then walks
        the remaining pages until the parsed total item count is reached.
        """
        classify_list, original_list = get_classify(self.classify_url, self.headers)
        print(classify_list)
        for classify in classify_list:
            print('开始爬取%s分类的数据' % classify)
            # Point self.url at page one of this category.
            self.get_url(classify)
            print(self.url)
            response_str = parse_url(self.url, self.headers)
            response_dict, total = self.parse_content(response_str)
            self.download(response_dict)
            print("第1页爬完..")
            # Item offset of the next page; the site pages in steps of 18.
            i = 18

            while i < total:
                self.get_url(classify, i)
                response_str = parse_url(self.url, self.headers)
                # total is refreshed each page in case the count changes.
                response_dict, total = self.parse_content(response_str)
                self.download(response_dict)
                i += 18
                # Bug fix: was `i / 18` (float true division); %d only
                # printed the right number because it truncates floats.
                j = i // 18
                print("第%d页爬完" % j)
def get_classify(classify_url, headers):
    """Fetch the category page and return (mapped_names, original_slugs).

    Args:
        classify_url: URL of the page listing the TV categories.
        headers: request headers passed through to parse_url.

    Returns:
        Tuple of two parallel lists: the classify names used by the crawler
        (some slugs are remapped) and the raw slugs scraped from the page.
    """
    # Site slug -> internal classify name; unknown slugs pass through as-is.
    alias = {
        'british': 'english',
        'korean': 'korean_drama',
        'chinese': 'domestic',
        'tvshow': 'variety',
    }
    classify_list = []
    original_list = []
    response = parse_url(classify_url, headers)
    # Parse the HTML and pull the hrefs of the category links,
    # e.g. '/tv/british'.
    selector = etree.HTML(response)
    '''/html/body/div[2]/div[1]/section[5]/div/ul/li[1]/a'''
    hrefs = selector.xpath('// ul [ @class = "type-list"] / li / a//@href')
    for href in hrefs:
        match = re.match(r'/tv/(\w+)', href)
        if match is None:
            # Robustness fix: the original called .group(1) unconditionally
            # and crashed with AttributeError on unexpected hrefs — skip them.
            continue
        slug = match.group(1)
        original_list.append(slug)
        classify_list.append(alias.get(slug, slug))

    return classify_list, original_list
Example #7
0
 def run(self):
     """Fetch self.url once, parse it, and print every extracted item."""
     # parse_url wraps requests: url + headers -> response body
     response = parse_url(self.url, self.headers)
     content_list = self.parse_response(response)
     # Idiom fix: iterate the list directly instead of indexing via
     # range(0, len(...)).
     for item in content_list:
         print(item)