コード例 #1
0
 def parse(self, response):
     """Parse the bangumi ranking page embedded in window.__INITIAL_STATE__
     and yield one BangumiItem per rank entry.

     On any failure, the maintainer is notified by mail.
     """
     try:
         # The ranking data lives in the third <script> tag as
         # "window.__INITIAL_STATE__={...};..." — cut out the JSON part.
         script_text = response.xpath("//script[3]/text()").extract()[0]
         payload = script_text[len('window.__INITIAL_STATE__='):].split(';')[0]
         state = json.loads(payload)
         for entry in state['rankList']:
             item = BangumiItem()
             item['title'] = entry['title']
             item['cover'] = entry['cover']
             item['newest_ep_index'] = entry['new_ep']['index_show']
             item['data'] = {
                 'danmaku': entry['stat']['danmaku'],
                 'watch': entry['stat']['follow'],
                 'play': entry['stat']['view'],
                 'pts': entry['pts'],
                 'review': entry['video_review'],
                 'datetime': datetime.datetime.now()
             }
             yield item
     except Exception as error:
         # Notify the maintainer by mail on any parsing failure.
         mailer.send(
             to=["*****@*****.**"],
             subject="BiliobSpiderError",
             body="{}\n{}".format(response.url, error),
         )
コード例 #2
0
 def parse(self, response):
     """Parse a user's video-list API response into one VideoWatcherItem.

     Collects every aid on the page plus the uploader's channel list.
     Failures are mailed and logged.
     """
     item = None  # pre-bound so the except handler cannot hit a NameError
     try:
         j = json.loads(response.body)
         vlist = j['data']['vlist']
         if not vlist:
             return
         # tlist maps channel id -> channel info; keep the values only.
         channels = j['data']['tlist']
         list_channel = [channels[key] for key in channels]
         aid = [int(each['aid']) for each in vlist]
         # Presumably every entry belongs to the same uploader; take the mid
         # from the last entry (matches the original loop's final value).
         mid = vlist[-1]['mid']
         item = VideoWatcherItem()
         item['aid'] = aid
         item['channels'] = list_channel
         item['mid'] = mid
         yield item
     except Exception as error:
         # Mail the maintainer and log the failure.
         mailer.send(
             to=["*****@*****.**"],
             subject="BiliobSpiderError",
             body="{}\n{}\n{}".format(item, response.url, error),
         )
         logging.error("视频爬虫在解析时发生错误")
         logging.error(response.url)
         logging.error(error)
コード例 #3
0
    def parse(self, response):
        """Parse an author-card API response into an AuthorItem and chain a
        request for the author's view statistics.

        Counts the crawl attempt, invalidates the author's redis cache, and
        reports failures by mail and log.
        """
        item = None  # pre-bound so the except handler cannot hit a NameError
        try:
            self.task.crawl_count += 1
            j = json.loads(response.body)
            card = j['data']['card']
            name = card['name']
            mid = card['mid']

            # Invalidate the cached author detail so readers get fresh data.
            self.redis_connection.delete("author_detail::{}".format(mid))

            fans = int(card['fans'])
            attention = int(card['attention'])
            archive = int(j['data']['archive_count'])
            article = int(j['data']['article_count'])

            item = AuthorItem()
            item['mid'] = int(mid)
            item['name'] = name
            item['face'] = card['face']
            item['official'] = card['Official']['title']
            item['sex'] = card['sex']
            item['level'] = int(card['level_info']['current_level'])
            item['data'] = {
                'fans': fans,
                'attention': attention,
                'archive': archive,
                'article': article,
                'datetime': datetime.datetime.now()
            }
            item['c_fans'] = fans
            item['c_attention'] = attention
            item['c_archive'] = archive
            item['c_article'] = article

            # A url of the form "...&<object_id>" carries the task object id.
            url_list = response.url.split('&')
            item['object_id'] = url_list[1] if len(url_list) == 2 else None
            yield Request(
                "https://api.bilibili.com/x/space/upstat?mid={mid}".format(
                    mid=str(mid)),
                meta={'item': item},
                method='GET',
                callback=self.parse_view)
        except Exception as error:
            # Count the failure, mail the maintainer, and log it.
            self.task.crawl_failed += 1
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
コード例 #4
0
    def parse(self, response):
        """Parse the online-video list page and chase each video's detail
        page (channel data is only available there).
        """
        item = None  # pre-bound so the except handler cannot hit a NameError
        try:
            self.task.crawl_count += 1
            video_list = response.xpath('//*[@id="app"]/div[2]/div[2]/div')

            title_list = video_list.xpath('./a/p/text()').extract()
            watch_list = video_list.xpath('./p/b/text()').extract()
            author_list = video_list.xpath('./div[1]/a/text()').extract()
            href_list = video_list.xpath('./a/@href').extract()
            for title, watch, author, href in zip(
                    title_list, watch_list, author_list, href_list):
                item = VideoOnline()
                item['title'] = title
                item['author'] = author
                item['data'] = {
                    'datetime': datetime.datetime.now(),
                    'number': watch
                }
                # href presumably looks like "/video/av<aid>/"; slice out the
                # numeric aid — TODO confirm against the live page markup.
                item['aid'] = href[9:-1]
                # Channel and fan data live on the video's detail page.
                yield Request("https://www.bilibili.com" + href,
                              meta={'item': item},
                              callback=self.detailParse)
        except Exception as error:
            # Count the failure, mail the maintainer, and log it.
            self.task.crawl_failed += 1
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
コード例 #5
0
    def parse(self, response):
        """Parse a ranking page's uploader links and request each uploader's
        card from the web-interface API.

        Removed an unused `av_list` extraction that the original computed
        and never read.
        """
        try:
            self.task.crawl_count += 1
            url_list = response.xpath(
                "//*[@id='app']/div[2]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/div/a/@href"
            ).extract()

            # The mid is the tail of the space url; the prefix
            # "//space.bilibili.com/" is 21 characters long.
            for each_url in url_list:
                yield Request(
                    "https://api.bilibili.com/x/web-interface/card?mid=" +
                    each_url[21:],
                    method='GET',
                    callback=self.detailParse)
        except Exception as error:
            self.task.crawl_failed += 1
            # Mail the maintainer and log the failure.
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}".format(response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
コード例 #6
0
 def parse(self, response):
     """Parse an author-card API response into an AuthorItem and chain a
     request for the author's view statistics.

     Removed a duplicated `face` read and pre-bound `item` so the except
     handler cannot raise NameError.
     """
     item = None  # pre-bound so the except handler cannot hit a NameError
     try:
         j = json.loads(response.body)
         card = j['data']['card']
         mid = card['mid']
         fans = int(card['fans'])
         attention = int(card['attention'])
         archive = int(j['data']['archive_count'])
         article = int(j['data']['article_count'])

         item = AuthorItem()
         item['mid'] = int(mid)
         item['name'] = card['name']
         item['face'] = card['face']
         item['official'] = card['Official']['title']
         item['sex'] = card['sex']
         item['level'] = int(card['level_info']['current_level'])
         item['data'] = {
             'fans': fans,
             'attention': attention,
             'archive': archive,
             'article': article,
             'datetime': datetime.datetime.now()
         }
         item['c_fans'] = fans
         item['c_attention'] = attention
         item['c_archive'] = archive
         item['c_article'] = article
         yield Request(
             "https://api.bilibili.com/x/space/upstat?mid={mid}".format(
                 mid=str(mid)),
             meta={'item': item},
             method='GET',
             callback=self.parse_view)
     except Exception as error:
         # Mail the maintainer and log the failure.
         mailer.send(
             to=["*****@*****.**"],
             subject="BiliobSpiderError",
             body="{}\n{}\n{}".format(item, response.url, error),
         )
         logging.error("视频爬虫在解析时发生错误")
         logging.error(response.url)
         logging.error(error)
コード例 #7
0
    def detailParse(self, response):
        """Parse an author card and yield an AuthorItem, but only for
        authors with more than 1000 fans.

        Fixes: the original fetched `face` twice and constructed an
        AuthorItem even when it was going to be discarded.
        """
        try:
            j = json.loads(response.body)
            card = j['data']['card']
            fans = int(card['fans'])

            # Only track authors with more than 1000 fans.
            if fans > 1000:
                attention = int(card['attention'])
                archive = int(j['data']['archive_count'])
                article = int(j['data']['article_count'])
                item = AuthorItem()
                item['c_fans'] = fans
                item['c_attention'] = attention
                item['c_archive'] = archive
                item['c_article'] = article
                item['mid'] = int(card['mid'])
                item['name'] = card['name']
                item['face'] = card['face']
                item['official'] = card['Official']['title']
                item['sex'] = card['sex']
                item['focus'] = True
                item['level'] = int(card['level_info']['current_level'])
                item['data'] = {
                    'fans': fans,
                    'attention': attention,
                    'archive': archive,
                    'article': article,
                    'datetime': datetime.datetime.now()
                }
                yield item
        except Exception as error:
            # Count the failure, mail the maintainer, and log it.
            self.task.crawl_failed += 1
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}".format(response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
コード例 #8
0
    def parse(self, response):
        """Parse the site-wide statistics API response into a SiteItem."""
        item = None  # pre-bound so the except handler cannot hit a NameError
        try:
            r = json.loads(response.body)
            d = r["data"]
            item = SiteItem()
            item['region_count'] = d['region_count']
            item['all_count'] = d['all_count']
            item['web_online'] = d['web_online']
            item['play_online'] = d['play_online']
            yield item

        except Exception as error:
            # Mail the maintainer and log the failure.
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
コード例 #9
0
 def parse(self, response):
     """Parse the video list page and request each video's archive data
     from the article API.

     Replaces the `for i in range(len(...))` index loop with direct
     iteration.
     """
     try:
         video_list = response.xpath('//*[@id="app"]/div[2]/div[2]/div')
         href_list = video_list.xpath('./a/@href').extract()
         for href in href_list:
             # href presumably looks like "/video/av<id>/"; slice out the
             # numeric id — TODO confirm against the live page markup.
             yield Request(
                 "https://api.bilibili.com/x/article/archives?ids=" +
                 href[9:-1],
                 callback=self.detailParse)
     except Exception as error:
         # Mail the maintainer and log the failure.
         mailer.send(
             to=["*****@*****.**"],
             subject="BiliobSpiderError",
             body="{}\n{}".format(response.url, error),
         )
         logging.error("视频爬虫在解析时发生错误")
         logging.error(response.url)
         logging.error(error)
コード例 #10
0
    def parse(self, response):
        """Parse a ranking page and yield a RankItem per listed video.

        Fixes: `lstrip('//space.bilibili.com/')` strips a character SET,
        not a prefix — replaced with taking the last path segment of the
        space url; `item` is pre-bound so the except handler cannot raise
        NameError.
        """
        item = None  # pre-bound so the except handler cannot hit a NameError
        try:
            url_list = response.xpath(
                '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/@href').extract()
            pts_list = response.xpath(
                '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[2]/div/text()').extract()
            mid_list = response.xpath(
                '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/@href').extract()

            title_list = response.xpath(
                '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()').extract()
            author_list = response.xpath(
                '//*[@id="app"]/div[2]/div/div/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()').extract()
            # The aid presumably starts at offset 27 of the video url —
            # TODO confirm against the live page markup.
            aid_list = [int(x[27:-1]) for x in url_list]
            pts_list = [int(x) for x in pts_list]
            # Take the numeric mid as the last path segment of the space url.
            mid_list = [int(x.strip('/').split('/')[-1]) for x in mid_list]
            channel = response.xpath(
                "//li[@class='active']/text()").extract()[0]
            for title, author, aid, pts, mid in zip(
                    title_list, author_list, aid_list, pts_list, mid_list):
                item = RankItem()
                item['title'] = title
                item['author'] = author
                item['aid'] = aid
                item['pts'] = pts
                item['mid'] = mid
                item['channel'] = channel
                yield item
        except Exception as error:
            # Mail the maintainer and log the failure.
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
コード例 #11
0
ファイル: tag.py プロジェクト: z464244404/biliob-spider
    def parse(self, response):
        """Parse the tag API response into a TagItem."""
        item = None  # pre-bound so the except handler cannot hit a NameError
        try:
            r = json.loads(response.body)
            d = r["data"]
            item = TagItem()
            item['tag_id'] = d['tag_id']
            item['tag_name'] = d['tag_name']
            item['ctime'] = d['ctime']
            item['use'] = d['count']['use']
            item['atten'] = d['count']['atten']
            yield item

        except Exception as error:
            # Mail the maintainer and log the failure.
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
コード例 #12
0
    def parse(self, response):
        """Parse a bangumi/donghua ranking page embedded in the page's
        window.__INITIAL_STATE__ script and yield one item per rank entry.

        The item's collection is chosen from the ranking url: /13/ is
        bangumi, /167/ is donghua.
        """
        try:
            self.task.crawl_count += 1
            script_text = response.xpath("//script[3]/text()").extract()[0]
            payload = script_text[len('window.__INITIAL_STATE__='):].split(';')[0]
            state = json.loads(payload)
            collection_by_url = {
                'https://www.bilibili.com/ranking/bangumi/13/0/7': 'bangumi',
                'https://www.bilibili.com/ranking/bangumi/167/0/7': 'donghua',
            }
            for entry in state['rankList']:
                item = BangumiOrDonghuaItem()
                item['title'] = entry['title']
                item['cover'] = entry['cover']
                item['newest_ep_index'] = entry['new_ep']['index_show']
                item['data'] = {
                    'danmaku': entry['stat']['danmaku'],
                    'watch': entry['stat']['follow'],
                    'play': entry['stat']['view'],
                    'pts': entry['pts'],
                    'review': entry['video_review'],
                    'datetime': datetime.datetime.now()
                }
                if response.url in collection_by_url:
                    item['collection'] = collection_by_url[response.url]
                yield item
        except Exception as error:
            # Count the failure and notify the maintainer by mail.
            self.task.crawl_failed += 1
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}".format(response.url, error),
            )
コード例 #13
0
    def parse(self, response):
        """Print like count, uploader id, title and description for every
        dynamic card in the response (debug/inspection helper)."""
        try:
            payload = json.loads(response.body)
            for each_card in payload['data']['cards']:
                desc = each_card['desc']
                print('点赞数:{}'.format(desc['like']))
                print('UP主ID:{}'.format(desc['uid']))
                # The card field is itself a JSON-encoded string.
                card = json.loads(each_card['card'])
                if 'title' in card:
                    print('标题:{}'.format(card['title']))
                if 'description' in card:
                    print('内容:{}'.format(card['description']))

        except Exception as error:
            # Mail the maintainer and log the failure.
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}".format(response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
コード例 #14
0
    def parse(self, response):
        """Parse a batch video-detail API response and yield one VideoItem
        per video, with current stats and channel classification.

        Fixes: the '资讯' sub-channel branch was unreachable (shadowed by
        the non-empty check) and used `==` (comparison) instead of `=`
        (assignment); `r` and `item` are pre-bound so the except handler
        cannot raise NameError when json.loads itself fails.
        """
        item = None
        r = None
        try:
            self.task.crawl_count += 1
            r = json.loads(response.body)
            d = r["data"]
            for each_key in d:
                video = d[each_key]
                stat = video['stat']
                owner = video['owner']
                # NOTE(review): sibling spiders call datetime.datetime.now();
                # this module presumably does `from datetime import datetime`.
                current_date = datetime.now()
                data = {
                    'view': stat['view'],
                    'favorite': stat['favorite'],
                    'danmaku': stat['danmaku'],
                    'coin': stat['coin'],
                    'share': stat['share'],
                    'like': stat['like'],
                    'datetime': current_date
                }

                subChannel = video['tname']
                tid = video['tid']
                item = VideoItem()
                item['current_view'] = stat['view']
                item['current_favorite'] = stat['favorite']
                item['current_danmaku'] = stat['danmaku']
                item['current_coin'] = stat['coin']
                item['current_share'] = stat['share']
                item['current_like'] = stat['like']
                item['current_datetime'] = current_date
                item['aid'] = stat['aid']
                item['mid'] = owner['mid']
                item['pic'] = video['pic']
                item['author'] = owner['name']
                item['data'] = data
                item['title'] = video['title']
                item['subChannel'] = subChannel
                item['datetime'] = video['pubdate']

                # '资讯' is split into real channels by tid; every other
                # non-empty sub-channel maps through the lookup table.
                if subChannel == '资讯':
                    item['channel'] = {51: '番剧', 170: '国创',
                                       159: '娱乐'}.get(tid)
                elif subChannel != '':
                    item['channel'] = sub_channel_2_channel[subChannel]
                else:
                    item['channel'] = None

                # A url of the form "...&<object_id>" carries the task id.
                url_list = response.url.split('&')
                item['object_id'] = url_list[1] if len(url_list) == 2 else None
                yield item

        except Exception as error:
            self.task.crawl_failed += 1
            # A -404 code means the video is gone; not worth reporting.
            if r is not None and r.get('code') == -404:
                return
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(item)
            logging.error(response.url)
            logging.error(error)
コード例 #15
0
    def parse(self, response):
        """Parse a historical daily video-stat response and yield a
        VideoItem for each video not yet recorded for that date.

        Fixes: the same `find_one` query was issued twice; `d != None`
        replaced by `is not None`; `item` is pre-bound so the except
        handler cannot raise NameError.
        """
        item = None
        try:
            if response.status == 404:
                return
            r = json.loads(response.body)
            # The crawl date arrives via request meta as "YYYYMMDD".
            date = response.meta['date']
            date_str = '{}-{}-{}'.format(date[:4], date[4:6], date[6:8])
            current_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
            for each in r:
                author = each['name']
                # Official bangumi accounts are not tracked by this spider.
                if author == '腾讯动漫' or author == '哔哩哔哩番剧':
                    continue
                aid = each['aid']
                view = each['playTotal']
                favorite = each['favoritesTotal']
                danmaku = each['danmakuTotal']
                data = {
                    'view': view,
                    'favorite': favorite,
                    'danmaku': danmaku,
                    # This endpoint does not report coin/share/like.
                    'coin': None,
                    'share': None,
                    'like': None,
                    'datetime': current_date
                }

                item = VideoItem()
                item['current_view'] = view
                item['current_favorite'] = favorite
                item['current_danmaku'] = danmaku
                item['current_coin'] = None
                item['current_share'] = None
                item['current_like'] = None
                item['current_datetime'] = current_date
                item['aid'] = aid
                item['mid'] = each['mid']
                item['pic'] = 'http:' + each['pic']
                item['author'] = author
                item['data'] = data
                item['title'] = each['title']
                item['subChannel'] = None
                item['datetime'] = each['created']

                # Reuse channel info from any existing record and skip the
                # video when this date is already stored (single query).
                d = self.coll.find_one({'aid': aid})
                already_recorded = False
                if d is not None and 'data' in d:
                    if 'subChannel' in d:
                        item['subChannel'] = d['subChannel']
                    if 'channel' in d:
                        item['channel'] = d['channel']
                    for each_data in d['data']:
                        if each_data['datetime'].strftime("%Y-%m-%d") == date_str:
                            already_recorded = True
                            break
                if not already_recorded:
                    yield item

        except Exception as error:
            # Mail the maintainer and log the failure.
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(item)
            logging.error(response.url)
            logging.error(error)
コード例 #16
0
    def detailParse(self, response):
        """Parse a batch video-detail response, build a VideoAndAuthorItem
        per video, and chain a request for each uploader's author card.

        Fixes: the '资讯' branch was unreachable (shadowed by the non-empty
        check) and used `==` (comparison) instead of `=` (assignment);
        `r` and `item` are pre-bound so the except handler cannot raise
        NameError when json.loads itself fails.
        """
        item = None
        r = None
        try:
            r = json.loads(response.body)
            d = r["data"]
            for each_key in d:
                video = d[each_key]
                stat = video['stat']
                mid = video['owner']['mid']
                current_date = datetime.datetime.now()
                data = {
                    'view': stat['view'],
                    'favorite': stat['favorite'],
                    'danmaku': stat['danmaku'],
                    'coin': stat['coin'],
                    'share': stat['share'],
                    'like': stat['like'],
                    'datetime': current_date
                }

                subChannel = video['tname']
                tid = video['tid']
                item = VideoAndAuthorItem()
                item['current_view'] = stat['view']
                item['current_favorite'] = stat['favorite']
                item['current_danmaku'] = stat['danmaku']
                item['current_coin'] = stat['coin']
                item['current_share'] = stat['share']
                item['current_like'] = stat['like']
                item['current_datetime'] = current_date
                item['aid'] = stat['aid']
                item['mid'] = mid
                item['pic'] = video['pic']
                item['author'] = video['owner']['name']
                item['data_video'] = data
                item['title'] = video['title']
                item['subChannel'] = subChannel
                item['datetime'] = video['pubdate']

                # '资讯' is split into real channels by tid; every other
                # non-empty sub-channel maps through the lookup table.
                if subChannel == '资讯':
                    item['channel'] = {51: '番剧', 170: '国创',
                                       159: '娱乐'}.get(tid)
                elif subChannel != '':
                    item['channel'] = sub_channel_2_channel[subChannel]
                else:
                    item['channel'] = None
                yield Request(
                    "https://api.bilibili.com/x/web-interface/card?mid=" +
                    str(mid),
                    meta={'item': item},
                    method='GET',
                    callback=self.authorParse)

        except Exception as error:
            # A -404 code means the video is gone; not worth reporting.
            if r is not None and r.get('code') == -404:
                return
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
            logging.error("视频爬虫在解析时发生错误")
            logging.error(item)
            logging.error(response.url)
            logging.error(error)