def parse(self, response): """ 解析关注、粉丝 :param response: :return: """ try: if response.status != 200: req_url = response.request.url err_urls.append(req_url) print "请求错误 : ", req_url, " 错误码 : ", response.status return data_item = response.meta[FLAG_DATA_ITEM] cookies = response.meta[FLAG_COOKIE] # 拿到JSON进行解析 s_json = response.text json_data = json.loads(s_json) data_item['fans'] = json_data['data']['total_subscribe_count'] data_item['clicks'] = json_data['data']['play_effective_count'] data_item['reads'] = json_data['data']['go_detail_count'] data_item.save() # url=get_video_count() # yield scrapy.Request(url=url, headers=BASE_HEAD, dont_filter=True, # cookies=BASE_COOKIES, # meta={FLAG_DATA_ITEM: data_item, FLAG_COOKIE: cookies}), callback=self.parse_play_article) except Exception, e: traceback.print_exc()
def parse(self, response): print "+++++++++++++++ start parse " + str( self.page_index) + "+++++++++++++++\n" if response.status != 200: req_url = response.request.url err_urls.append(req_url) print "请求错误 : ", req_url, " 错误码 : ", response.status return s_json = response.text # .decode('unicode_escape') # print "response : ", s_json status = json.loads(s_json)['status'] if not status: logging.error("==========状态异常 status is err ==========") return # 视频列表 vlist = json.loads(s_json)['data']['vlist'] # 处理 print "vlist len : " + str(len(vlist)) for item in vlist: bili_item = LxdzxBiliItem() for show_item in bili_show_list: try: key = show_item[0] bili_item[key] = str(item[key]) except Exception, e: traceback.print_exc() yield bili_item
def get_detail(self, response):
    """Parse the related-videos payload for one video.

    Yields the parent item (from meta['item']) once per related entry,
    mutating its gl_title / gl_url fields in place each time; when the
    payload is effectively empty the parent item is yielded unchanged.

    :param response: response whose meta['item'] holds the parent item
    :return: generator yielding the (mutated) parent item
    """
    # Related-videos handler
    if response.status != 200:
        req_url = response.request.url
        err_urls.append(req_url)
        print "get_detail请求错误 : ", req_url, " 错误码 : ", response.status
        return
    parent_item = response.meta['item']
    # Turn \uXXXX escape sequences in the payload into readable characters.
    s_json = response.text.decode('unicode_escape')
    # A tiny body means "no related videos": emit the parent item as-is.
    if len(s_json) <= 10:
        yield parent_item
        return
    gl_list = json.loads(s_json)  # list of related entries
    if len(gl_list):
        for item in gl_list:
            try:
                # NOTE(review): each entry is stringified and split on commas;
                # this breaks if a field itself contains a comma — confirm the
                # upstream entry format.
                s_items = str(item).split(",")
                # SECURITY: eval() on scraped (untrusted) data can execute
                # arbitrary code; it is used here to strip the quoting around
                # the title — ast.literal_eval would be the safe replacement.
                parent_item["gl_title"] = eval(str(s_items[2]))  # title
                # Field 1 holds the av-number of the related video.
                parent_item[
                    "gl_url"] = "https://www.bilibili.com/video/av" + (
                    s_items[1]).strip()
                yield parent_item
            except Exception, e:
                # Malformed entries are logged and skipped.
                traceback.print_exc()
def parse(self, response): """ 解析关注、粉丝 :param response: :return: """ try: if response.status != 200: req_url = response.request.url err_urls.append(req_url) print "请求错误 : ", req_url, " 错误码 : ", response.status return data_item = response.meta[FLAG_DATA_ITEM] mid = response.meta[FLAG_KEY_MID] # 拿到JSON进行解析 s_json = response.text json_data = json.loads(s_json) data_item['follows'] = json_data['data']['following'] # 关注 data_item['fans'] = json_data['data']['follower'] # 粉丝 url = get_play_article(mid) yield scrapy.Request(url=url, headers=BASE_HEAD, dont_filter=True, cookies=BASE_COOKIES, meta={ FLAG_DATA_ITEM: data_item, FLAG_KEY_MID: mid }, callback=self.parse_play_article) except Exception, e: traceback.print_exc()
def parse_play_article(self, response): """ 解析阅读、播放 :param response: :return: """ try: if response.status != 200: req_url = response.request.url err_urls.append(req_url) print "请求错误 : ", req_url, " 错误码 : ", response.status return data_item = response.meta[FLAG_DATA_ITEM] mid = response.meta[FLAG_KEY_MID] # 拿到JSON进行解析 s_json = response.text json_data = json.loads(s_json) data_item['clicks'] = json_data['data']['archive']['view'] # 点击量 data_item['reads'] = json_data['data']['article']['view'] # 阅读数 # 保存 # print data_item data_item.save() except Exception, e: traceback.print_exc()
def is_err_url(response): # 是否为错误URL if response.status != 200: req_url = response.request.url err_urls.append(req_url) print "请求错误 : ", req_url, " 错误码 : ", response.status return True else: return False