def parse(self, response):
    global title
    print(self.start)
    if len(self.start) > 1:
        print('from socket')
        items = BilibiliItem()
        items['file_urls'] = self.start[0:len(self.start) - 1]
        title = self.start[len(self.start) - 1]
        print(title, 'parse')
        yield items
    else:
        print('from json')
        json_path = AbsDirectory.file_path + 'bilibili/bilibili/spiders/tomcat/long/long.json'
        with open(json_path, 'r', encoding='utf-8') as f:
            json_list = json.load(f)
        print(json_list)
        print(json_list[-1])
        items = BilibiliItem()
        # Append the 'end' sentinel for the consumer of `urls`; the slice below
        # keeps file_urls limited to the original entries.
        json_list.append('end')
        items['file_urls'] = json_list[0:len(json_list) - 1]
        os.remove(json_path)
        Store.file_length = len(json_list)
        title = self.start[0]
        global urls
        urls = json_list
        yield items
def parse(self, response):
    final = self.get_video_url(response)
    logging.warning(response.headers)
    items = BilibiliItem()
    items['file_urls'] = [final]
    items_audio = BilibiliItem()
    items_audio['file_urls'] = [self.get_audio_url(response)]
    yield items
    yield items_audio
def parse_up(self, response):
    user = json.loads(response.text)
    if not user['status']:
        return
    user = user['data']
    item = BilibiliItem()
    # mid arrives as an int; keep it as a string so the URL concatenations below work.
    item['uid'] = str(user['mid'])
    item['name'] = user['name']
    item['space'] = 'https://space.bilibili.com/' + item['uid']
    item['sex'] = user['sex']
    try:
        item['birthday'] = user['birthday'][-5:]
    except KeyError:
        item['birthday'] = ''
    try:
        item['address'] = user['place']
    except KeyError:
        item['address'] = ''
    item['level'] = user['level_info']['current_level']
    try:
        t = time.localtime(user['regtime'])
        item['regtime'] = time.strftime('%Y-%m-%d', t)
    except KeyError:
        item['regtime'] = ''
    item['fans'] = user['fans']
    item['follows'] = user['attention']
    item['playnum'] = user['playNum']
    url = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + item['uid']
    yield Request(url, callback=self.parse_video, meta={'userdata': item})
def parse_2(self, response):
    global title
    title_l = response.xpath("//h1//text()").extract()
    if len(title_l) > 1:
        title = title_l[-1]
    else:
        title = title_l[0]
    # Fall back through the quality labels, from 1080P60 down to standard
    # definition, until a split actually matches something in the page source.
    urls = response.text.split('\\\"1080P60\\\"')
    if len(urls) == 1:
        urls = response.text.split('\\\"1080P\\\"')
    if len(urls) == 1:
        urls = response.text.split('\\\"超清\\\"')
    if len(urls) == 1:
        urls = response.text.split('\\\"高清\\\"')
    if len(urls) == 1:
        urls = response.text.split('\\\"标清\\\"')
    t = urls[1]
    raw_url = self.get_str(t, '[', ']')
    logging.warning(raw_url)
    a = BilibiliItem()
    # url = 'https://ali-video.acfun.cn/mediacloud/acfun/acfun_video/segment/' + self.start_urls[0]
    a['file_urls'] = [raw_url]
    yield a
def parse_page(self, response):
    av = response.xpath(".//span[@class='type avid']/text()").get()
    print(response.request.headers['User-Agent'])
    print(av)
    if av is None:
        return
    try:
        href = response.xpath(
            "/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[1]/div/div[1]/a/@href"
        ).get()
        date = response.xpath(
            "/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[1]/div/div[3]/span[3]/text()"
        ).get()
        date = date.strip()
        # Video duration shown on the card; named time_text to avoid shadowing the time module.
        time_text = response.xpath(
            "/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[1]/a/div/span[1]/text()"
        ).get()
        item = BilibiliItem()
        item['time'] = time_text
        item['date'] = date
        r = re.findall('BV[0-9a-zA-Z]*', href)
        url2 = "https://api.bilibili.com/x/web-interface/view?&bvid=" + "".join(r)
        print(url2)
        yield scrapy.Request(url2, callback=self.parse, dont_filter=True, meta={"item": item})
    except Exception:
        return
def parse_detail(self, response):
    item = BilibiliItem()
    item_brief_list = ['badge', 'badge_type', 'is_finish', 'media_id', 'index_show', 'season_id', 'title']
    item_order_list = ['follow', 'play', 'pub_date', 'pub_real_time', 'renewal_time', 'score']
    m = response.meta
    for key in item_brief_list:
        item[key] = m[key] if key in m else ""
    for key in item_order_list:
        item[key] = m['order'][key] if key in m['order'] else ""
    tags = response.xpath('//*[@class="media-tag"]/text()').extract()
    item['tags'] = ' '.join(tags)
    item['brief'] = response.xpath('//*[@name="description"]/attribute::content').extract()
    detail_text = response.xpath('//script')[4].extract()
    actor_p = re.compile('actors":(.*?),')
    ratings_count_p = re.compile('count":(.*?),')
    staff_p = re.compile('staff":(.*?),')
    item['cv'] = re.findall(actor_p, detail_text)[0]
    item['staff'] = re.findall(staff_p, detail_text)[0]
    count_list = re.findall(ratings_count_p, detail_text)
    item['count'] = count_list[0] if len(count_list) > 0 else 0
    # self.log(item)
    return item
def parse(self, response):
    full_dir = AbsDirectory.file_path + 'bilibili/bilibili/spiders/tomcat/full/'
    for x in os.listdir(full_dir):
        os.remove(full_dir + x)
    b = BilibiliItem()
    b['file_urls'] = [
        'https://www.acfun.cn/?pagelets=pagelet_game,pagelet_douga,pagelet_bangumi_list,pagelet_life,'
        'pagelet_tech,pagelet_dance,pagelet_music,pagelet_film,pagelet_fishpond,pagelet_s'
        'port&reqID=0&ajaxpipe=1&t=1582458727656'
    ]
    yield b
def parse(self, response):
    full_dir = AbsDirectory.file_path + 'bilibili/bilibili/spiders/tomcat/full/'
    for x in os.listdir(full_dir):
        os.remove(full_dir + x)
    b = BilibiliItem()
    url_info = ('https://www.acfun.cn/u/{}?quickViewId=ac-space-video-list&reqID=1&a'
                'jaxpipe=1&type=video&order=newest&page={}&pageSize=20&t=1587549164677'
                .format(self.up_id, self.page_no))
    b['file_urls'] = [url_info]
    yield b
def parse_video(self, response):
    item = BilibiliItem()
    try:
        soup = BeautifulSoup(response.body, 'html.parser')
        scriptText = soup.findAll('script')[3].get_text()
    except Exception:
        print('nothing found in soup')
    else:
        # title
        titles = re.findall(r'"title":".+?"', scriptText)
        item['title'] = titles[0].replace('"title":', '').replace('"', '')
        # related video titles
        relatedArr = []
        for rawRelateTitle in titles[2:]:
            relatedArr.append(rawRelateTitle.replace('"title":', '').replace('"', ''))
        item['related'] = relatedArr
        # aid
        aids = re.findall(r'"aid":\d+', scriptText)
        item['av'] = aids[0].replace('"aid":', '')
        # bvid
        bvids = re.findall(r'"bvid":"BV.+?"', scriptText)
        item['bv'] = bvids[0].replace('"bvid":', '').replace('"', '')
        # pic (cover image; unescape the \uXXXX sequences)
        pics = re.findall(r'"pic":".+?"', scriptText)
        pic = pics[0].replace('"pic":', '').replace('"', '')
        item['pic'] = pic.encode('latin-1').decode('unicode_escape')
        # desc
        descs = re.findall(r'"desc":".+?"', scriptText)
        item['desc'] = descs[0].replace('"desc":', '').replace('"', '')
        # partsDict
        matchLst = re.findall(r'"part":".*?"', scriptText)
        if matchLst:
            titleDic = {}
            for count, titleTxt in enumerate(matchLst):
                titleDic[count] = titleTxt.replace('"', '').replace('part:', '')
            # whatever is yielded here ends up in the pipeline
            item['partsDict'] = titleDic
        # pubdate
        pubdate = re.findall(r'"pubdate":\d+', scriptText)
        item['pubdate'] = pubdate[0].replace('"pubdate":', '')
        # viewseo
        viewseo = re.findall(r'"viewseo":\d+', scriptText)
        item['viewseo'] = viewseo[0].replace('"viewseo":', '')
        # number of comments
        numOfComments = re.findall(r'"reply":\d+', scriptText)
        item['numOfComments'] = numOfComments[0].replace('"reply":', '')
        yield item
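# A minimal alternative sketch for parse_video above, not the original author's
# method: bilibili video pages commonly embed `window.__INITIAL_STATE__ = {...};`,
# so the whole blob can be parsed once with json.loads instead of one regex per
# field. Assumes no string value inside the JSON contains a literal "};".
def extract_initial_state(html):
    match = re.search(r'window\.__INITIAL_STATE__\s*=\s*({.*?});', html, re.DOTALL)
    return json.loads(match.group(1)) if match else {}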
def parse_data(self, response):
    # Pull the leading digits out of a node's title attribute, defaulting to 0.
    get_data = lambda x: re.findall(r"\d+", response.xpath(x)[0].attrib.get('title'))[0] \
        if len(re.findall(r"\d+", response.xpath(x)[0].attrib.get('title'))) != 0 else 0
    # Take the first text node at the given path, defaulting to 0.
    get_subscriber = lambda x: response.xpath(x).extract()[0] \
        if len(response.xpath(x).extract()) != 0 else 0

    def clean(value):
        # Strip dashes, escaped and literal newlines, and spaces from the raw text.
        return str(value).replace("-", "").replace(r"\n", "").replace("\n", "").replace(" ", "")

    item = BilibiliItem()
    item['title'] = clean(response.xpath('//*[@id="viewbox_report"]/h1/span/text()').extract()[0])
    item['play_num'] = clean(get_data('//*[@id="viewbox_report"]/div[2]/span[1]'))
    item['bullet_screen'] = clean(get_data('//*[@id="viewbox_report"]/div[2]/span[2]'))
    item['like'] = clean(get_data('//*[@id="arc_toolbar_report"]/div[1]/span[1]'))
    item['coin'] = clean(get_subscriber('//*[@id="arc_toolbar_report"]/div[1]/span[2]/text()'))
    item['collect'] = clean(get_subscriber('//*[@id="arc_toolbar_report"]/div[1]/span[3]/text()'))
    item['subscriber'] = clean(get_subscriber('//*[@id="v_upinfo"]/div[3]/div[2]/span/span/text()'))
    item['author'] = clean(response.xpath('//*[@id="v_upinfo"]/div[2]/div/a[1]/text()').extract()[0])
    item['av_num'] = 0
    return item
def parse_up_url(self, response):
    global totalPage
    totalPage = response.xpath(
        '//div[@class="ac-space-contribute-list"]//ul//li[@class="active"]//@data-count'
    ).extract()[0]
    # 20 videos per page, so round up to get the page count.
    totalPage = int(int(totalPage) / 20) + 1
    s = ('https://www.acfun.cn/u/{}?quickViewId=ac-space-video-list&reqID=4&ajaxpipe='
         '1&type=video&order=newest&page={}&pageSize=20&t=1587619209806'.format(up_id, 1))
    b = BilibiliItem()
    b['file_urls'] = [s]
    print(totalPage)
    yield b
def open_spider(self, spider):
    if not os.path.exists("spiders/yeyinfuStat"):
        os.mkdir("spiders/yeyinfuStat")
    self.f = open('spiders/yeyinfuStat/YeyinfuVideoTitles.txt', 'w')
    # newline='' prevents the csv module from emitting blank rows on Windows.
    self.csvf = open('spiders/yeyinfuStat/YeyinfuVideoTitles.csv', 'w', newline='')
    self.itemf = open('spiders/yeyinfuStat/YeyinfuItems.csv', 'w', newline='')
    try:
        # Write the item field names as the CSV header row.
        itemWriter = csv.writer(self.itemf)
        header = list(BilibiliItem().fields.keys())
        itemWriter.writerow(header)
    except Exception:
        pass
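# A hedged sketch of the matching item writer (process_item is the standard
# Scrapy pipeline hook; the field list mirrors the header written above).
# csv.DictWriter fills any field an item lacks with its default restval of ''.
def process_item(self, item, spider):
    writer = csv.DictWriter(self.itemf, fieldnames=list(BilibiliItem().fields.keys()))
    writer.writerow(dict(item))
    return item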
def parse(self, response):
    # '//li' selects the rank items from the document root; the original
    # '..//li' relied on stepping above the root element.
    rank_list = response.xpath('//li[@class="rank-item"]')
    print("-----")
    for rank_item in rank_list:
        item = BilibiliItem()
        item['title'] = rank_item.xpath(
            './div[@class="content"]/div[@class="info"]/a/text()'
        )[0].extract()
        item['href'] = rank_item.xpath(
            './div[@class="content"]/div[@class="info"]/a/@href'
        )[0].extract()
        yield item
def bilibili_parse(self, response):
    item = BilibiliItem()
    r = json.loads(response.body)
    if response.status == 200 and r['status'] != False:
        data = r['data']
        for key in item.fields:
            if key == 'currentLevel':
                item[key] = data['level_info']['current_level']
            else:
                item[key] = data[key] if key in data else 'no'
        yield item
    else:
        return
def parse_userinfo(self, response):
    """Scrape the user's mid, name, sex and related profile fields."""
    userinfo_resultJson = json.loads(response.body)
    userinfo_result = userinfo_resultJson['data']
    item = BilibiliItem()
    item['_id'] = userinfo_result['mid']
    item['name'] = userinfo_result['name']
    item['level'] = userinfo_result['level']
    item['coins'] = userinfo_result['coins']
    item['sex'] = userinfo_result['sex']
    yield item
def parse(self, response):
    # Named li_list to avoid shadowing the built-in `list`.
    li_list = response.xpath('//ul[@class="rank-list"]/li')
    print("len:", len(li_list))
    for i in li_list:
        # A fresh item per <li>; reusing one item mutates objects already yielded.
        item = BilibiliItem()
        item["number"] = i.xpath('./div[@class="num"]/text()').extract()[0]
        print(type(item["number"]))
        item["title"] = i.xpath('.//div[@class="info"]/a/text()').extract()[0]
        url = i.xpath('.//div[@class="info"]/a/@href').extract()[0]
        item["url"] = url.split("//")[-1]
        item["grade"] = i.xpath('.//div[@class="pts"]/div/text()').extract()[0]
        item["play_number"] = i.xpath('.//div[@class="detail"]/span[1]/text()').extract()[0]
        item["comments"] = i.xpath('.//div[@class="detail"]/span[2]/text()').extract()[0]
        yield item
def next(self, response):
    print('request succeeded')
    a = response.body.decode('utf-8', 'ignore')
    item = BilibiliItem()
    print(len(a))
    pat1 = '"uname":"(.*?)"'
    pat2 = '"sex":"(.*?)"'
    pat3 = '"sign":"(.*?)"'
    pat4 = '"message":"(.*?)"'
    item['uname'] = re.compile(pat1).findall(a)
    item['usex'] = re.compile(pat2).findall(a)
    item['usign'] = re.compile(pat3).findall(a)
    item['ucomment'] = re.compile(pat4).findall(a)
    return item
def parse_item(self, response):
    div = response.xpath("//div[@id='viewbox_report']")
    # Title
    title = div.xpath(".//span/text()").get()
    # Video category breadcrumb
    category = ">".join(div.xpath(".//span[@class='a-crumbs']/a/text()").getall())
    # Publish time
    publish_time = div.xpath(".//div[1]/span[2]/text()").get()
    # Play count
    play_text = div.xpath(".//span[contains(@title, '播放数')]/text()").get()
    play_count = re.sub(r"播放.*", "", play_text)
    # Danmaku (bullet comment) count
    barrage_text = div.xpath(".//span[contains(@title, '弹幕数')]/text()").get()
    barrage_count = re.sub(r"弹幕.*", "", barrage_text)
    # Like, coin, and favorite counts; a bare placeholder label means zero.
    ops_list = [
        x.strip() for x in response.xpath("//div[@class='ops']/span/text()").getall()
    ]
    like_count = ops_list[0] if ops_list[0] != "点赞" else "0"
    throw_coin_count = ops_list[1] if ops_list[1] != "投币" else "0"
    collection_count = ops_list[2] if ops_list[2] != "收藏" else "0"
    # Comment count
    comment_count = response.xpath("//meta[@itemprop='commentCount']/@content").get()
    # Tag list
    tag_text = response.xpath("//ul[contains(@class, 'tag-area')]/li//text()").getall()
    tag_names = ",".join(tag_text)
    info = {
        "title": title,
        "category": category,
        "publish_time": publish_time,
        "play_count": play_count,
        "barrage_count": barrage_count,
        "like_count": like_count,
        "throw_coin_count": throw_coin_count,
        "collection_count": collection_count,
        "comment_count": comment_count,
        "tag_names": tag_names
    }
    # Normalize counts like "1.2万" (万 = 10,000) to plain integers.
    for k, v in info.copy().items():
        if ("_count" in k) and ("万" in v):
            info[k] = int(float(v.replace("万", "")) * 10000)
    yield BilibiliItem(**info)
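# A small helper sketch generalizing the count normalization in parse_item.
# The "亿" (10^8) branch is an assumption about other pages, not something the
# method above requires; "万" is 10^4 exactly as handled there.
def normalize_count(text):
    if "亿" in text:
        return int(float(text.replace("亿", "")) * 10 ** 8)
    if "万" in text:
        return int(float(text.replace("万", "")) * 10 ** 4)
    return int(text)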
def parse_each_cartoon_comment(self, response):
    print('entered the comment parser')
    all_comment = response.xpath('//div[@class="review-list-wrp type-short"]//li')
    if all_comment:
        for comment in all_comment:
            print('found a comment entry')
            Item = BilibiliItem()
            Item['username'] = comment.xpath(
                './/div[@class="review-author-name"]//text()'
            ).extract_first().strip()
            Item['comment'] = comment.xpath(
                './/div[@class="review-content"]//text()'
            ).extract_first().strip()
            yield Item
def parse(self, response):
    ul = response.xpath('//*[@class="rank-list"]')
    if not ul:
        # Extraction failed; log the URL.
        self.log("----------------------%s" % response.url)
    else:
        # Extraction succeeded; log the URL.
        self.log("++++++++++++++++++++++%s" % response.url)
        # Collect the list of <li> entries.
        lis = ul[0].xpath('./li')
        for bilibili in lis:
            bilibi_item = BilibiliItem()
            try:
                # Rank number
                bilibi_item['vnum'] = bilibili.xpath(
                    './div[@class="num"]/text()').extract()[0]
                # Title
                bilibi_item['vtitle'] = bilibili.xpath(
                    './div[@class="content"]/div[@class="info"]/a/text()'
                ).extract()[0]
                # URL
                bilibi_item['vurl'] = bilibili.xpath(
                    './div[@class="content"]/div[@class="info"]/a/@href'
                ).extract()[0]
                # Composite score
                bilibi_item['vpts'] = bilibili.xpath(
                    './div[@class="content"]/div[@class="info"]/div[@class="pts"]/div/text()'
                ).extract()[0]
                self.log('vnum=:' + bilibi_item['vnum'])
                self.log('vtitle=:' + bilibi_item['vtitle'])
                self.log('vurl=:' + bilibi_item['vurl'])
                self.log('vpts=:' + bilibi_item['vpts'])
            except IndexError as e:
                # Log any field that failed to extract.
                self.log("!!!!!!!!!!!" + str(e))
            # Yield items one by one; Scrapy callbacks must not yield a list of items.
            yield bilibi_item
def parse(self, response):
    js = json.loads(response.body)
    # A non-zero code marks a video that cannot be crawled or no longer exists.
    if js['code'] == 0:
        data = js['data']
        if isinstance(data['view'], int) and data['view'] >= 1000:
            item = BilibiliItem()
            # Copy everything from data into the item, then request the
            # second-level page to complete it.
            for key in data.keys():
                item[key] = data[key]
            yield scrapy.Request(
                'https://www.bilibili.com/video/av{}'.format(data['aid']),
                meta={'item': item},
                callback=self.parse_detail)
def parse(self, response):
    # Use response.text, not response.body: a str pattern cannot search bytes.
    temp = re.search('({.+})', response.text).group(1)
    datas = json.loads(temp)['result']
    if datas:
        for data in datas:
            # A fresh item per result, so earlier yields are not mutated.
            item = BilibiliItem()
            item['title'] = data['title']
            item['author'] = data['author']
            item['play_count'] = data['play']
            item['danmu_count'] = data['video_review']
            item['arcurl'] = data['arcurl']
            yield item
        # Follow the next results page by bumping the page parameter.
        page_num = re.search(r'page=(\d+)', response.url).group(1)
        page_num = 'page=' + str(int(page_num) + 1)
        next_url = re.sub(r'page=\d+', page_num, response.url)
        yield Request(next_url, headers=self.headers)
def parse2(self, response):
    # Parse once instead of rebinding `response` to its own text.
    user_json = json.loads(response.text)
    data = user_json["data"]
    Item = BilibiliItem()
    Item["userid"] = data["mid"]
    Item["username"] = data["name"]
    Item["head_img"] = data["face"]
    Item["register_time"] = data["regtime"]
    Item["birthday"] = data["birthday"]
    Item["place"] = data["place"]
    yield Item
def parse_item(self, response):
    print("*" * 20, response.url)
    sel = Selector(response)
    item = BilibiliItem()
    item["url"] = response.url
    item["crawl_time"] = int(time.time())
    item["title"] = ''.join(sel.xpath("//title/text()").extract())
    item["keywords"] = ''.join(
        sel.xpath('//meta[@name="keywords"]/@content').extract())
    item["description"] = ''.join(
        sel.xpath('//meta[@name="description"]/@content').extract())
    item["author"] = ''.join(
        sel.xpath('//meta[@name="author"]/@content').extract())
    item["cover_image"] = ''.join(
        sel.xpath('//img[@class="cover_image"]/@src').extract())
    item["h_title"] = ''.join(sel.xpath('//h1/@title').extract())
    item["startDate"] = ''.join(
        sel.xpath('//time[@itemprop="startDate"]/@datetime').extract())
    item["info"] = self.extract_info(sel)
    item["upinfo"] = self.extract_upinfo(sel)
    item["video_info"] = self.extract_video_info(sel)
    item["tag_list"] = self.extract_tag_list(sel)
    # cid and aid are embedded in the page source; search the decoded text.
    m = re.findall(r"cid=([0-9]+)&aid=([0-9]+)", response.text)
    print(m)
    if len(m) == 1:
        item["cid"] = m[0][0]
        item["aid"] = m[0][1]
        item["stats"] = self.extract_stats(sel, item["aid"], item)
        json_data = json.loads(item["stats"])
        if "data" in json_data:
            json_data2 = json_data["data"]
            item["view"] = json_data2["view"]
            item["danmaku"] = json_data2["danmaku"]
            item["reply"] = json_data2["reply"]
            item["favorite"] = json_data2["favorite"]
            item["coin"] = json_data2["coin"]
            item["share"] = json_data2["share"]
        else:
            print(response.url, "cid wrong")
    return item
def parse(self, response):
    """Collect the ep_id list and the cid list."""
    # Mind the whitespace in the regex and keep the inner match non-greedy,
    # otherwise it swallows too much of the page.
    epList_str = re.findall(r"\"epList\":\s*\[(.*?)\],", response.body.decode(), re.DOTALL)[0]
    ep_id_list = re.findall(r",\"id\":\s*(.*?),", epList_str, re.DOTALL)
    cid_list = re.findall(r"\"cid\":\s*(.*?),", epList_str, re.DOTALL)
    # 104 episodes in total
    pop_url = "https://comment.bilibili.com/{}.xml"
    av_url = "https://www.bilibili.com/bangumi/play/ep{}"
    # zip pairs each cid with its ep_id; the original cid_list.index(cid)
    # lookup breaks when two episodes share a cid.
    for ep_id, cid in zip(ep_id_list, cid_list):
        item = BilibiliItem()
        item["av_url"] = av_url.format(ep_id)
        url = pop_url.format(cid)
        yield scrapy.Request(url, callback=self.detail, meta={"item": item})
def parse(self, response):
    drama = json.loads(response.text)
    data_list = drama['data']['list']
    for entry in data_list:
        # Build a fresh item per entry; reusing one item across iterations
        # mutates objects that were already yielded.
        item = BilibiliItem()
        item['number'] = self.i
        item['badge'] = entry['badge']
        item['cover_img'] = entry['cover']
        item['index_show'] = entry['index_show']
        item['link'] = entry['link']
        item['media_id'] = entry['media_id']
        item['order_type'] = entry['order_type']
        item['season_id'] = entry['season_id']
        item['title'] = entry['title']
        self.i += 1
        yield item
def parse(self, response):
    path = AbsDirectory.file_path + 'bilibili/bilibili/spiders/tomcat/full/'
    for x in os.listdir(path):
        os.remove(path + x)
    # Rebuild a cookie dict from the Set-Cookie headers; decode() avoids the
    # "b'...'" prefix that str() puts on a bytes value.
    cookie1 = response.headers.getlist('Set-Cookie')
    cookies = {}
    for i in cookie1:
        i = i.decode()
        l1 = i.split(';')
        l2 = l1[0].split('=')
        cookies[l2[0]] = l2[1]
    a = BilibiliItem()
    # url = 'https://ali-video.acfun.cn/mediacloud/acfun/acfun_video/segment/' + self.start_urls[0]
    a['file_urls'] = [self.start_urls[0]]
    yield scrapy.Request(self.start_urls[0], cookies=cookies, callback=self.parse_2)
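# A sketch of the same Set-Cookie extraction using the standard library's
# http.cookies.SimpleCookie, which also tolerates attributes such as Path
# and Expires instead of assuming a bare name=value pair:
from http.cookies import SimpleCookie

def cookies_from_headers(set_cookie_headers):
    jar = SimpleCookie()
    for raw in set_cookie_headers:
        jar.load(raw.decode())
    return {name: morsel.value for name, morsel in jar.items()}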
def parse1(self, response):
    page = Selector(response)
    try:
        titles = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/a/text()').extract()
        urls = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/a/@href').extract()
        texts = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/div[@class="v-desc"]/text()').extract()
        peoples = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/div[@class="v-info"]/span[1]/span/text()').extract()
        danmus = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/div[@class="v-info"]/span[2]/span/text()').extract()
        class_name = page.xpath('//li[@class="on"]/a/text()').extract()
        for i in range(len(titles)):
            # A fresh item per video, so earlier yields are not overwritten.
            item = BilibiliItem()
            item['class_name'] = class_name[0]
            item['title'] = self.cleanInput(titles[i])
            item['url'] = urls[i]
            item['text'] = self.cleanInput(texts[i])
            item['people'] = peoples[i]
            item['danmu'] = danmus[i]
            yield item
    except Exception:
        pass
def next_information_parse(self, response):
    print("HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH")
    print(response.status)
    if response.status == 403:
        print("HHHHHHHHHHHHHHHHHHHHHHHHHH page_information_parse HHHHHHHHHHHHHHHHHHHHHHHHHH")
        # Rate-limited: back off for six minutes, then retry the same URL.
        time.sleep(60 * 6)
        yield scrapy.Request(response.url, headers=response.request.headers, meta={"status": 403})
        return
    print(response.url)
    self.sum += 1
    print("count: " + str(self.sum))
    try:
        # Strip the JSONP wrapper before parsing.
        info = json.loads(response.body[38:-1], strict=False)
        for archive in info["data"]["archives"]:
            item = BilibiliItem()
            item["aid"] = archive["aid"]
            item["cid"] = archive["cid"]
            item["copyright"] = archive["copyright"]  # 1 = reposting allowed; 2 = reposting not allowed
            item["tname"] = archive["tname"]  # category
            item["title"] = archive["title"]
            item["videos"] = archive["videos"]  # number of pages
            item["ctime"] = archive["ctime"]  # creation time
            item["pubdate"] = archive["pubdate"]  # publish time
            item["duration"] = archive["duration"]  # video duration
            item["coin"] = archive["stat"]["coin"]
            item["favorite"] = archive["stat"]["favorite"]
            item["likes"] = archive["stat"]["like"]
            item["archive"] = json.dumps(archive)
            yield item
    except Exception as e:
        print(e)
        if "try" not in response.meta:
            yield scrapy.Request(response.url, callback=self.next_information_parse,
                                 meta={"try": 1}, headers=response.request.headers,
                                 dont_filter=False)
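# The hard-coded slice response.body[38:-1] above assumes a fixed-length JSONP
# callback name. A more defensive sketch (an alternative, not the original
# author's approach) strips whatever wraps the first top-level JSON object:
def unwrap_jsonp(body):
    text = body.decode('utf-8', 'ignore')
    match = re.search(r'\((\{.*\})\)', text, re.DOTALL)
    return json.loads(match.group(1) if match else text, strict=False)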
def parse(self, response):
    try:
        sql.delete_requested(response.meta['page'])
        item = BilibiliItem()
        jsdict = json.loads(response.text)
        jsdata = jsdict['data']
        item['name_'] = str(jsdata['name'])
        item['uid'] = jsdata['mid']
        item['play_num'] = jsdata['playNum']
        item['sex'] = jsdata['sex']
        item['birthday'] = jsdata['birthday'][5:] if 'birthday' in jsdata else ''
        item['area'] = jsdata['place'] if 'place' in jsdata else ''
        if 'regtime' in jsdata:
            reg_time = time.localtime(jsdata['regtime'])
            item['reg_time'] = time.strftime('%Y-%m-%d', reg_time)
        else:
            item['reg_time'] = ''
        item['coins'] = jsdata['coins']
        item['article'] = jsdata['article']
        item['level_'] = jsdata['level_info']['current_level']
        item['exp'] = jsdata['level_info']['current_exp']
        item['description'] = jsdata['description']
        # Fetch follower stats from the relation API (a blocking call).
        url = ('http://api.bilibili.com/x/relation/stat?vmid='
               + response.meta['page'] + '&jsonp=jsonp')
        try:
            js_fans = json.loads(requests.get(url).text)
            item['following'] = js_fans['data']['following']
            item['fans'] = js_fans['data']['follower']
        except Exception:
            pass
        return item
    except Exception:
        print('uid:%d does not exist' % int(response.meta['page']))
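# The blocking requests.get call above stalls Scrapy's event loop. A
# non-blocking sketch would instead chain the relation-stat call, e.g.
# `yield scrapy.Request(url, callback=self.parse_relation, meta={'item': item})`,
# and finish the item in a follow-up callback (parse_relation is a
# hypothetical name, not part of the original spider):
def parse_relation(self, response):
    item = response.meta['item']
    stat = json.loads(response.text)['data']
    item['following'] = stat['following']
    item['fans'] = stat['follower']
    yield item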