def parse(self, response):
    try:
        # window.__INITIAL_STATE__ is embedded in the third <script> tag;
        # keep everything before the first ';' (fragile if the JSON itself
        # contains a semicolon) and parse it.
        j = json.loads(
            response.xpath("//script[3]/text()").extract()[0]
            [len('window.__INITIAL_STATE__='):].split(';')[0])
        for each in j['rankList']:
            item = BangumiItem()
            item['title'] = each['title']
            item['cover'] = each['cover']
            # item['square_cover'] = each['square_cover']
            # item['is_finish'] = each['is_finish']
            # item['is_started'] = each['is_started']
            item['newest_ep_index'] = each['new_ep']['index_show']
            item['data'] = {
                'danmaku': each['stat']['danmaku'],
                'watch': each['stat']['follow'],
                'play': each['stat']['view'],
                'pts': each['pts'],
                'review': each['video_review'],
                'datetime': datetime.datetime.now()
            }
            yield item
    except Exception as error:
        # Report the error by mail when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}".format(response.url, error),
        )
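# A minimal sketch of a sturdier way to pull window.__INITIAL_STATE__ out of a
# page, assuming the payload is a single JSON object whose first "};" closes
# the assignment; splitting on ';' breaks if the JSON itself contains one. The
# helper name extract_initial_state is hypothetical, not part of this project.
import json
import re

def extract_initial_state(script_text):
    # Non-greedy match: everything from the opening '{' up to the first '};'.
    match = re.search(
        r'window\.__INITIAL_STATE__\s*=\s*(\{.*?\})\s*;', script_text, re.S)
    return json.loads(match.group(1)) if match else None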
def parse(self, response):
    item = None
    try:
        j = json.loads(response.body)
        if len(j['data']['vlist']) == 0:
            return
        # tlist maps channel ids to channel objects; collect the values.
        channels = j['data']['tlist']
        list_channel = []
        for each_channel in channels:
            list_channel.append(channels[each_channel])
        aid = []
        for each in j['data']['vlist']:
            aid.append(int(each['aid']))
            mid = each['mid']  # every entry in vlist shares the same uploader
        item = VideoWatcherItem()
        item['aid'] = aid
        item['channels'] = list_channel
        item['mid'] = mid
        yield item
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Video-watcher spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
def parse(self, response):
    item = None
    try:
        self.task.crawl_count += 1
        j = json.loads(response.body)
        name = j['data']['card']['name']
        mid = j['data']['card']['mid']
        # Invalidate the cached author detail in Redis.
        self.redis_connection.delete("author_detail::{}".format(mid))
        sex = j['data']['card']['sex']
        face = j['data']['card']['face']
        fans = j['data']['card']['fans']
        attention = j['data']['card']['attention']
        level = j['data']['card']['level_info']['current_level']
        official = j['data']['card']['Official']['title']
        archive = j['data']['archive_count']
        article = j['data']['article_count']
        item = AuthorItem()
        item['mid'] = int(mid)
        item['name'] = name
        item['face'] = face
        item['official'] = official
        item['sex'] = sex
        item['level'] = int(level)
        item['data'] = {
            'fans': int(fans),
            'attention': int(attention),
            'archive': int(archive),
            'article': int(article),
            'datetime': datetime.datetime.now()
        }
        item['c_fans'] = int(fans)
        item['c_attention'] = int(attention)
        item['c_archive'] = int(archive)
        item['c_article'] = int(article)
        # The object id, when present, rides on the request URL after '&'.
        url_list = response.url.split('&')
        if len(url_list) == 2:
            item['object_id'] = url_list[1]
        else:
            item['object_id'] = None
        yield Request(
            "https://api.bilibili.com/x/space/upstat?mid={mid}".format(
                mid=str(mid)),
            meta={'item': item},
            method='GET',
            callback=self.parse_view)
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        self.task.crawl_failed += 1
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Author spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
def parse(self, response):
    item = None
    try:
        self.task.crawl_count += 1
        video_list = response.xpath('//*[@id="app"]/div[2]/div[2]/div')
        title_list = video_list.xpath('./a/p/text()').extract()
        watch_list = video_list.xpath('./p/b/text()').extract()
        author_list = video_list.xpath('./div[1]/a/text()').extract()
        href_list = video_list.xpath('./a/@href').extract()
        for i in range(len(title_list)):
            item = VideoOnline()
            item['title'] = title_list[i]
            item['author'] = author_list[i]
            item['data'] = {
                'datetime': datetime.datetime.now(),
                'number': watch_list[i]
            }
            # Strip the '/video/av' prefix and trailing '/' to get the aid.
            item['aid'] = href_list[i][9:-1]
            # Channel, fan count, etc. only live on each video's detail
            # page, so follow the link for a second pass.
            yield Request("https://www.bilibili.com" + href_list[i],
                          meta={'item': item},
                          callback=self.detailParse)
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        self.task.crawl_failed += 1
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Video spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
def parse(self, response):
    try:
        self.task.crawl_count += 1
        url_list = response.xpath(
            "//*[@id='app']/div[2]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/div/a/@href"
        ).extract()
        # Fan counts and the rest of the author card only come from the API;
        # the href is '//space.bilibili.com/<mid>', and [21:] strips that
        # prefix to leave the mid.
        for each_url in url_list:
            yield Request(
                "https://api.bilibili.com/x/web-interface/card?mid=" +
                each_url[21:],
                method='GET',
                callback=self.detailParse)
    except Exception as error:
        self.task.crawl_failed += 1
        # Report by mail and log the error when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}".format(response.url, error),
        )
        logging.error("Author spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
def parse(self, response):
    item = None
    try:
        j = json.loads(response.body)
        name = j['data']['card']['name']
        mid = j['data']['card']['mid']
        sex = j['data']['card']['sex']
        face = j['data']['card']['face']
        fans = j['data']['card']['fans']
        attention = j['data']['card']['attention']
        level = j['data']['card']['level_info']['current_level']
        official = j['data']['card']['Official']['title']
        archive = j['data']['archive_count']
        article = j['data']['article_count']
        item = AuthorItem()
        item['mid'] = int(mid)
        item['name'] = name
        item['face'] = face
        item['official'] = official
        item['sex'] = sex
        item['level'] = int(level)
        item['data'] = {
            'fans': int(fans),
            'attention': int(attention),
            'archive': int(archive),
            'article': int(article),
            'datetime': datetime.datetime.now()
        }
        item['c_fans'] = int(fans)
        item['c_attention'] = int(attention)
        item['c_archive'] = int(archive)
        item['c_article'] = int(article)
        yield Request(
            "https://api.bilibili.com/x/space/upstat?mid={mid}".format(
                mid=str(mid)),
            meta={'item': item},
            method='GET',
            callback=self.parse_view)
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Author spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
def detailParse(self, response):
    try:
        j = json.loads(response.body)
        name = j['data']['card']['name']
        mid = j['data']['card']['mid']
        sex = j['data']['card']['sex']
        face = j['data']['card']['face']
        fans = j['data']['card']['fans']
        attention = j['data']['card']['attention']
        level = j['data']['card']['level_info']['current_level']
        official = j['data']['card']['Official']['title']
        archive = j['data']['archive_count']
        article = j['data']['article_count']
        # Only keep authors with more than 1000 fans.
        if int(fans) > 1000:
            item = AuthorItem()
            item['c_fans'] = int(fans)
            item['c_attention'] = int(attention)
            item['c_archive'] = int(archive)
            item['c_article'] = int(article)
            item['mid'] = int(mid)
            item['name'] = name
            item['face'] = face
            item['official'] = official
            item['sex'] = sex
            item['focus'] = True
            item['level'] = int(level)
            item['data'] = {
                'fans': int(fans),
                'attention': int(attention),
                'archive': int(archive),
                'article': int(article),
                'datetime': datetime.datetime.now()
            }
            yield item
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        self.task.crawl_failed += 1
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}".format(response.url, error),
        )
        logging.error("Author spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
def parse(self, response): try: r = json.loads(response.body) d = r["data"] item = SiteItem() item['region_count'] = d['region_count'] item['all_count'] = d['all_count'] item['web_online'] = d['web_online'] item['play_online'] = d['play_online'] yield item except Exception as error: # 出现错误时打印错误日志 mailer.send( to=["*****@*****.**"], subject="BiliobSpiderError", body="{}\n{}\n{}".format(item, response.url, error), ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error)
def parse(self, response):
    try:
        video_list = response.xpath('//*[@id="app"]/div[2]/div[2]/div')
        href_list = video_list.xpath('./a/@href').extract()
        for i in range(len(href_list)):
            # Channel and stat data only come from the archives API, so
            # request each video by aid ('/video/av' prefix and trailing
            # '/' stripped from the href).
            yield Request(
                "https://api.bilibili.com/x/article/archives?ids=" +
                href_list[i][9:-1],
                callback=self.detailParse)
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}".format(response.url, error),
        )
        logging.error("Video spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
def parse(self, response):
    item = None
    try:
        url_list = response.xpath(
            '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract()
        pts_list = response.xpath(
            '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract()
        mid_list = response.xpath(
            '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract()
        title_list = response.xpath(
            '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract()
        author_list = response.xpath(
            '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract()
        # Strip the '//www.bilibili.com/video/av' prefix and trailing '/'.
        aid_list = list(map(lambda x: int(x[27:-1]), url_list))
        pts_list = list(map(int, pts_list))
        # str.lstrip takes a character *set*, not a prefix, so trim the
        # slashes and take the last path segment to get the mid.
        mid_list = list(
            map(lambda x: int(x.strip('/').split('/')[-1]), mid_list))
        channel = response.xpath(
            "//li[@class='active']/text()").extract()[0]
        for each in zip(title_list, author_list, aid_list, pts_list, mid_list):
            item = RankItem()
            item['title'] = each[0]
            item['author'] = each[1]
            item['aid'] = each[2]
            item['pts'] = each[3]
            item['mid'] = each[4]
            item['channel'] = channel
            yield item
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Rank spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
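# Fixed-offset slicing like x[27:-1] breaks silently if the URL format ever
# changes. A sketch of a sturdier id parse using only the standard library;
# parse_id_from_href is a hypothetical helper, not project code.
from urllib.parse import urlparse

def parse_id_from_href(href, prefix=''):
    # '//space.bilibili.com/12345/'            -> 12345
    # '//www.bilibili.com/video/av170001/', 'av' -> 170001
    segment = urlparse(href).path.strip('/').split('/')[-1]
    return int(segment[len(prefix):]) if prefix else int(segment)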
def parse(self, response): try: r = json.loads(response.body) d = r["data"] item = TagItem() item['tag_id'] = d['tag_id'] item['tag_name'] = d['tag_name'] item['ctime'] = d['ctime'] item['use'] = d['count']['use'] item['atten'] = d['count']['atten'] yield item except Exception as error: # 出现错误时打印错误日志 mailer.send( to=["*****@*****.**"], subject="BiliobSpiderError", body="{}\n{}\n{}".format(item, response.url, error), ) logging.error("视频爬虫在解析时发生错误") logging.error(response.url) logging.error(error)
def parse(self, response):
    try:
        self.task.crawl_count += 1
        # window.__INITIAL_STATE__ is embedded in the third <script> tag;
        # keep everything before the first ';' and parse it.
        j = json.loads(
            response.xpath("//script[3]/text()").extract()[0]
            [len('window.__INITIAL_STATE__='):].split(';')[0])
        for each in j['rankList']:
            item = BangumiOrDonghuaItem()
            item['title'] = each['title']
            item['cover'] = each['cover']
            # item['square_cover'] = each['square_cover']
            # item['is_finish'] = each['is_finish']
            # item['is_started'] = each['is_started']
            item['newest_ep_index'] = each['new_ep']['index_show']
            item['data'] = {
                'danmaku': each['stat']['danmaku'],
                'watch': each['stat']['follow'],
                'play': each['stat']['view'],
                'pts': each['pts'],
                'review': each['video_review'],
                'datetime': datetime.datetime.now()
            }
            # The ranking URL tells bangumi and donghua apart.
            if response.url == 'https://www.bilibili.com/ranking/bangumi/13/0/7':
                item['collection'] = 'bangumi'
            elif response.url == 'https://www.bilibili.com/ranking/bangumi/167/0/7':
                item['collection'] = 'donghua'
            yield item
    except Exception as error:
        # Report the error by mail when parsing fails.
        self.task.crawl_failed += 1
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}".format(response.url, error),
        )
def parse(self, response):
    try:
        j = json.loads(response.body)
        cards = j['data']['cards']
        for each_card in cards:
            print('Likes: {}'.format(each_card['desc']['like']))
            print('Uploader UID: {}'.format(each_card['desc']['uid']))
            # The card payload is itself a JSON string, so decode it again.
            card = json.loads(each_card['card'])
            if 'title' in card:
                print('Title: {}'.format(card['title']))
            if 'description' in card:
                print('Description: {}'.format(card['description']))
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}".format(response.url, error),
        )
        logging.error("Dynamic spider failed while parsing")
        logging.error(response.url)
        logging.error(error)
def parse(self, response):
    item = None
    r = {}
    try:
        self.task.crawl_count += 1
        r = json.loads(response.body)
        d = r["data"]
        keys = list(d.keys())
        for each_key in keys:
            aid = d[each_key]['stat']['aid']
            author = d[each_key]['owner']['name']
            mid = d[each_key]['owner']['mid']
            view = d[each_key]['stat']['view']
            favorite = d[each_key]['stat']['favorite']
            danmaku = d[each_key]['stat']['danmaku']
            coin = d[each_key]['stat']['coin']
            share = d[each_key]['stat']['share']
            like = d[each_key]['stat']['like']
            current_date = datetime.datetime.now()
            data = {
                'view': view,
                'favorite': favorite,
                'danmaku': danmaku,
                'coin': coin,
                'share': share,
                'like': like,
                'datetime': current_date
            }
            subChannel = d[each_key]['tname']
            title = d[each_key]['title']
            date = d[each_key]['pubdate']
            tid = d[each_key]['tid']
            pic = d[each_key]['pic']
            item = VideoItem()
            item['current_view'] = view
            item['current_favorite'] = favorite
            item['current_danmaku'] = danmaku
            item['current_coin'] = coin
            item['current_share'] = share
            item['current_like'] = like
            item['current_datetime'] = current_date
            item['aid'] = aid
            item['mid'] = mid
            item['pic'] = pic
            item['author'] = author
            item['data'] = data
            item['title'] = title
            item['subChannel'] = subChannel
            item['datetime'] = date
            # '资讯' videos are re-bucketed onto a main channel by tid; check
            # them first so the branch is reachable, then fall back to the
            # sub-channel mapping.
            if subChannel == '资讯':
                if tid == 51:
                    item['channel'] = '番剧'
                elif tid == 170:
                    item['channel'] = '国创'
                elif tid == 159:
                    item['channel'] = '娱乐'
            elif subChannel != '':
                item['channel'] = sub_channel_2_channel[subChannel]
            else:
                item['channel'] = None
            # The object id, when present, rides on the request URL after '&'.
            url_list = response.url.split('&')
            if len(url_list) == 2:
                item['object_id'] = url_list[1]
            else:
                item['object_id'] = None
            yield item
    except Exception as error:
        self.task.crawl_failed += 1
        # A -404 just means the video is gone; report everything else.
        if r.get('code') == -404:
            return
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Video spider failed while parsing")
        logging.error(item)
        logging.error(response.url)
        logging.error(error)
def parse(self, response):
    item = None
    try:
        if response.status == 404:
            return
        r = json.loads(response.body)
        for each in r:
            aid = each['aid']
            author = each['name']
            mid = each['mid']
            view = each['playTotal']
            favorite = each['favoritesTotal']
            danmaku = each['danmakuTotal']
            coin = None
            share = None
            like = None
            # The crawl date rides in on request meta as 'YYYYMMDD'.
            date = response.meta['date']
            date_str = '{}-{}-{}'.format(date[:4], date[4:6], date[6:8])
            current_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
            data = {
                'view': view,
                'favorite': favorite,
                'danmaku': danmaku,
                'coin': coin,
                'share': share,
                'like': like,
                'datetime': current_date
            }
            subChannel = None
            tid = None
            title = each['title']
            date = each['created']
            pic = 'http:' + each['pic']
            item = VideoItem()
            item['current_view'] = view
            item['current_favorite'] = favorite
            item['current_danmaku'] = danmaku
            item['current_coin'] = coin
            item['current_share'] = share
            item['current_like'] = like
            item['current_datetime'] = current_date
            item['aid'] = aid
            item['mid'] = mid
            item['pic'] = pic
            item['author'] = author
            item['data'] = data
            item['title'] = title
            item['subChannel'] = subChannel
            item['datetime'] = date
            # Official bangumi accounts are tracked elsewhere; skip them.
            if author == '腾讯动漫' or author == '哔哩哔哩番剧':
                continue
            d = self.coll.find_one({'aid': aid})
            exists = False
            if d is not None and 'data' in d:
                if 'subChannel' in d:
                    item['subChannel'] = d['subChannel']
                if 'channel' in d:
                    item['channel'] = d['channel']
                # Skip this item if a data point for the date already exists.
                for each_data in d['data']:
                    if each_data['datetime'].strftime("%Y-%m-%d") == date_str:
                        exists = True
                        break
            if not exists:
                yield item
    except Exception as error:
        # Report by mail and log the error when parsing fails.
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Video spider failed while parsing")
        logging.error(item)
        logging.error(response.url)
        logging.error(error)
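# An alternative sketch of the "data point for this date already exists"
# check: MongoDB dot notation matches a field inside an array of
# subdocuments, so the duplicate test can run server-side instead of
# scanning d['data'] in Python. This assumes the stored datetimes are
# midnight-aligned, as the strptime of date_str above produces; the helper
# name history_point_exists is hypothetical, not project code.
def history_point_exists(coll, aid, current_date):
    # Matches any element of the 'data' array whose datetime equals the
    # crawled date.
    return coll.find_one({'aid': aid, 'data.datetime': current_date}) is not None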
def detailParse(self, response):
    item = None
    r = {}
    try:
        r = json.loads(response.body)
        d = r["data"]
        keys = list(d.keys())
        for each_key in keys:
            aid = d[each_key]['stat']['aid']
            author = d[each_key]['owner']['name']
            mid = d[each_key]['owner']['mid']
            view = d[each_key]['stat']['view']
            favorite = d[each_key]['stat']['favorite']
            danmaku = d[each_key]['stat']['danmaku']
            coin = d[each_key]['stat']['coin']
            share = d[each_key]['stat']['share']
            like = d[each_key]['stat']['like']
            current_date = datetime.datetime.now()
            data = {
                'view': view,
                'favorite': favorite,
                'danmaku': danmaku,
                'coin': coin,
                'share': share,
                'like': like,
                'datetime': current_date
            }
            subChannel = d[each_key]['tname']
            title = d[each_key]['title']
            date = d[each_key]['pubdate']
            tid = d[each_key]['tid']
            pic = d[each_key]['pic']
            item = VideoAndAuthorItem()
            item['current_view'] = view
            item['current_favorite'] = favorite
            item['current_danmaku'] = danmaku
            item['current_coin'] = coin
            item['current_share'] = share
            item['current_like'] = like
            item['current_datetime'] = current_date
            item['aid'] = aid
            item['mid'] = mid
            item['pic'] = pic
            item['author'] = author
            item['data_video'] = data
            item['title'] = title
            item['subChannel'] = subChannel
            item['datetime'] = date
            # '资讯' videos are re-bucketed onto a main channel by tid; check
            # them first so the branch is reachable, then fall back to the
            # sub-channel mapping.
            if subChannel == '资讯':
                if tid == 51:
                    item['channel'] = '番剧'
                elif tid == 170:
                    item['channel'] = '国创'
                elif tid == 159:
                    item['channel'] = '娱乐'
            elif subChannel != '':
                item['channel'] = sub_channel_2_channel[subChannel]
            else:
                item['channel'] = None
            yield Request(
                "https://api.bilibili.com/x/web-interface/card?mid=" +
                str(mid),
                meta={'item': item},
                method='GET',
                callback=self.authorParse)
    except Exception as error:
        # A -404 just means the video is gone; report everything else.
        if r.get('code') == -404:
            return
        mailer.send(
            to=["*****@*****.**"],
            subject="BiliobSpiderError",
            body="{}\n{}\n{}".format(item, response.url, error),
        )
        logging.error("Video spider failed while parsing")
        logging.error(item)
        logging.error(response.url)
        logging.error(error)
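# The channel-resolution branch above appears in several spiders; a minimal
# consolidation sketch. resolve_channel is a hypothetical helper, not part of
# this project; the mapping argument stands in for the sub_channel_2_channel
# dict these spiders use.
def resolve_channel(sub_channel, tid, mapping):
    # '资讯' entries are re-bucketed onto a main channel by tid.
    if sub_channel == '资讯':
        return {51: '番剧', 170: '国创', 159: '娱乐'}.get(tid)
    if sub_channel:
        # .get avoids a KeyError on subchannels missing from the mapping.
        return mapping.get(sub_channel)
    return None

# Usage inside the loop above would then be a single line:
#     item['channel'] = resolve_channel(subChannel, tid, sub_channel_2_channel)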