Example #1
    def parse(self, response):
        global title
        print(self.start)
        if len(self.start) > 1:
            print('from socket')
            items = BilibiliItem()
            # Every entry except the last is a file URL; the last one is the title.
            items['file_urls'] = self.start[:-1]
            title = self.start[-1]
            print(title, 'parse')
            yield items
        else:
            print('from json')
            json_path = (AbsDirectory.file_path +
                         'bilibili/bilibili/spiders/tomcat/long/long.json')
            with open(json_path, 'r', encoding='utf-8') as f:
                json_list = json.load(f)
            print(json_list)
            print(json_list[-1])
            items = BilibiliItem()
            json_list.append('end')
            items['file_urls'] = json_list[:-1]
            # The with-block has closed the file, so it can be removed safely.
            os.remove(json_path)
            Store.file_length = len(json_list)
            title = self.start[0]
            global urls
            urls = json_list
            yield items
Example #2
    def parse(self, response):

        final = self.get_video_url(response)
        logging.warning(response.headers)
        items = BilibiliItem()
        items['file_urls'] = [final]
        items_audio = BilibiliItem()
        items_audio['file_urls'] = [self.get_audio_url(response)]
        yield items
        yield items_audio
Example #3
	def parse_up(self, response):
		user = json.loads(response.text)
		if not user['status']:
			return
		user = user['data']
		item = BilibiliItem()
		item['uid'] = user['mid']
		item['name'] = user['name']
		item['space'] = 'https://space.bilibili.com/' + str(item['uid'])  # mid is numeric, so cast before concatenating
		item['sex'] = user['sex']
		try:
			item['birthday'] = user['birthday'][-5:]
		except KeyError:
			item['birthday'] = ''
		try:
			item['address'] = user['place']
		except KeyError:
			item['address'] = ''
		item['level'] = user['level_info']['current_level']
		try:
			t = time.localtime(user['regtime'])
			item['regtime'] = time.strftime('%Y-%m-%d',t)
		except KeyError:
			item['regtime'] = ''
		item['fans'] = user['fans']
		item['follows'] = user['attention']
		item['playnum'] = user['playNum']

		url = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=' + str(item['uid'])
		yield Request(url, callback=self.parse_video, meta={'userdata': item})
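The Request above carries the half-filled item forward in meta. A hypothetical sketch of the receiving parse_video callback (the callback name comes from the example; the body, the JSON shape, and the 'videos' field are assumptions):

	def parse_video(self, response):
		# Pick up the item passed along via meta, attach the video list
		# from this response, then yield the completed item.
		item = response.meta['userdata']
		data = json.loads(response.text)  # assumed JSON payload
		item['videos'] = data.get('data', [])  # hypothetical field name
		yield item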
Example #4
    def parse_2(self, response):
        global title

        title_l = response.xpath("//h1//text()").extract()
        if len(title_l) > 1:
            title = response.xpath("//h1//text()").extract()[len(title_l) - 1]
        else:
            title = title_l[0]
        urls = response.text.split('\\\"1080P60\\\"')
        if len(urls) == 1:
            urls = response.text.split('\\\"1080P\\\"')
            if len(urls) == 1:
                urls = response.text.split('\\\"超清\\\"')
                if len(urls) == 1:
                    urls = response.text.split('\\\"高清\\\"')
                    if len(urls) == 1:
                        urls = response.text.split('\\\"标清\\\"')

        t = urls[1]
        raw_url = self.get_str(t, '[', ']')
        logging.warning(raw_url)
        a = BilibiliItem()
        # url ='https://ali-video.acfun.cn/mediacloud/acfun/acfun_video/segment/'+self.start_urls[0]
        a['file_urls'] = [raw_url]
        yield a
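The nested splits above fall through video-quality markers from highest to lowest (1080P60, 1080P, 超清, 高清, 标清). The same fallback chain can be written as a loop; a minimal sketch (split_by_quality is a hypothetical name):

    QUALITY_MARKERS = ['\\"1080P60\\"', '\\"1080P\\"', '\\"超清\\"', '\\"高清\\"', '\\"标清\\"']

    def split_by_quality(text):
        # Try each marker in descending quality order; return the first split
        # that actually divides the page text.
        for marker in QUALITY_MARKERS:
            parts = text.split(marker)
            if len(parts) > 1:
                return parts
        return [text]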
Example #5
 def parse_page(self, response):
     av = response.xpath(".//span[@class='type avid']/text()").get()
     print(response.request.headers['User-Agent'])
     print(av)
     if av is None:
         return
     try:
         href = response.xpath(
             "/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[1]/div/div[1]/a/@href"
         ).get()
         date = response.xpath(
             "/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[1]/div/div[3]/span[3]/text()"
         ).get()
         date = date.strip()
         # print(href)
         # time = response.xpath("/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li/a/div/span[1]/text()").get()
         time = response.xpath(
             "/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[1]/a/div/span[1]/text()"
         ).get()
         # print(time)
         item = BilibiliItem()
         item['time'] = time
         item['date'] = date
         r = re.findall('BV[0-9a-zA-Z]*', href)
         url2 = "https://api.bilibili.com/x/web-interface/view?&bvid=" + "".join(
             r)
         print(url2)
         yield scrapy.Request(url2,
                              callback=self.parse,
                              dont_filter=True,
                              meta={"item": item})
     except Exception:
         return
Example #6
    def parse_detail(self, response):
        item = BilibiliItem()
        item_brief_list = ['badge', 'badge_type', 'is_finish', 'media_id',
                           'index_show', 'season_id', 'title']
        item_order_list = ['follow', 'play', 'pub_date', 'pub_real_time',
                           'renewal_time', 'score']
        m = response.meta
        for key in item_brief_list:
            item[key] = m.get(key, "")
        for key in item_order_list:
            item[key] = m['order'].get(key, "")
        tags = response.xpath('//*[@class="media-tag"]/text()').extract()
        item['tags'] = ''.join(' ' + t for t in tags)
        item['brief'] = response.xpath('//*[@name="description"]/attribute::content').extract()
        detail_text = response.xpath('//script')[4].extract()
        actor_p = re.compile('actors":(.*?),')
        ratings_count_p = re.compile('count":(.*?),')
        staff_p = re.compile('staff":(.*?),')
        item['cv'] = re.findall(actor_p, detail_text)[0]
        item['staff'] = re.findall(staff_p, detail_text)[0]
        count_list = re.findall(ratings_count_p, detail_text)
        item['count'] = count_list[0] if count_list else 0
#        self.log(item)
        return item
Example #7
 def parse(self, response):
     for x in os.listdir(AbsDirectory.file_path+'bilibili/bilibili/spiders/tomcat/full/'):
         os.remove(AbsDirectory.file_path+'bilibili/bilibili/spiders/tomcat/full/'+x)
     b = BilibiliItem()
     b['file_urls'] = ['https://www.acfun.cn/?pagelets=pagelet_game,pagelet_douga,pagelet_bangumi_list,pagelet_life,'
                      'pagelet_tech,pagelet_dance,pagelet_music,pagelet_film,pagelet_fishpond,pagelet_s'
                      'port&reqID=0&ajaxpipe=1&t=1582458727656']
     yield b
Example #8
 def parse(self, response):
     for x in os.listdir(AbsDirectory.file_path +
                         'bilibili/bilibili/spiders/tomcat/full/'):
         os.remove(AbsDirectory.file_path +
                   'bilibili/bilibili/spiders/tomcat/full/' + x)
     b = BilibiliItem()
     url_info = 'https://www.acfun.cn/u/{}?quickViewId=ac-space-video-list&reqID=1&a' \
                   'jaxpipe=1&type=video&order=newest&page={}&pageSize=20&t=1587549164677'.\
         format(self.up_id, self.page_no)
     b['file_urls'] = [url_info]
     yield b
Example #9
        def parse_video(self, response):
            item = BilibiliItem()

            try:
                # Explicit parser avoids BeautifulSoup's "no parser specified" warning.
                soup = BeautifulSoup(response.body, 'html.parser')
                scriptText = soup.findAll('script')[3].get_text()
            except Exception:
                print('nothing found in soup')
            else:
                #title
                titles = re.findall(r'"title":".+?"', scriptText)
                title = titles[0].replace('"title":', '').replace('"', '')
                item['title'] = title
                #related
                relatedArr = []
                for rawRelateTitle in titles[2:]:
                    relatedtitle = rawRelateTitle.replace('"title":',
                                                          '').replace('"', '')
                    relatedArr.append(relatedtitle)
                item['related'] = relatedArr
                #aid
                aids = re.findall(r'"aid":\d+', scriptText)
                item['av'] = aids[0].replace('"aid":', '')
                #bvid
                bvids = re.findall(r'"bvid":"BV.+?"', scriptText)
                item['bv'] = bvids[0].replace('"bvid":', '').replace('"', '')
                #pic
                pics = re.findall(r'"pic":".+?"', scriptText)
                pic = pics[0].replace('"pic":', '').replace('"', '')
                item['pic'] = pic.encode('latin-1').decode('unicode_escape')
                #desc
                descs = re.findall(r'"desc":".+?"', scriptText)
                desc = descs[0].replace('"desc":', '').replace('"', '')
                item['desc'] = desc
                #partsDict
                matchLst = re.findall(r'"part":".*?"', scriptText)
                if matchLst != []:
                    titleDic = {}
                    for titleTxt, count in zip(matchLst, range(len(matchLst))):
                        title = titleTxt.replace('"', '').replace('part:', '')
                        titleDic[count] = title
                    # Whatever is yielded here ends up in the pipeline.
                    item['partsDict'] = titleDic
                #pubdate
                pubdate = re.findall(r'"pubdate":\d+', scriptText)
                item['pubdate'] = pubdate[0].replace('"pubdate":', '')
                #viewseo
                viewseo = re.findall(r'"viewseo":\d+', scriptText)
                item['viewseo'] = viewseo[0].replace('"viewseo":', '')
                #numOfComments
                numOfComments = re.findall(r'"reply":\d+', scriptText)
                item['numOfComments'] = numOfComments[0].replace(
                    '"reply":', '')
            yield item
Example #10
    def parse_data(self, response):
        # print(response.text)

        def get_data(x):
            # First run of digits in the node's title attribute, else 0.
            found = re.findall(r"\d+", response.xpath(x)[0].attrib.get('title'))
            return found[0] if found else 0

        def get_subscriber(x):
            # The node's extracted text, else 0.
            found = response.xpath(x).extract()
            return found[0] if found else 0
        item = BilibiliItem()
        item['title'] = str(
            response.xpath('''//*[@id="viewbox_report"]/h1/span/text()''').
            extract()[0]).replace("-",
                                  "").replace(r"\n",
                                              "").replace("\n",
                                                          "").replace(" ", "")
        item['play_num'] = str(
            get_data('''//*[@id="viewbox_report"]/div[2]/span[1]''')).replace(
                "-", "").replace(r"\n", "").replace("\n", "").replace(" ", "")
        item['bullet_screen'] = str(
            get_data('''//*[@id="viewbox_report"]/div[2]/span[2]''')).replace(
                "-", "").replace(r"\n", "").replace("\n", "").replace(" ", "")
        # item['like'] = response.xpath('''//*[@id="arc_toolbar_report"]/div[1]/span[1]/text()''')
        item['like'] = str(
            get_data(
                '''//*[@id="arc_toolbar_report"]/div[1]/span[1]''')).replace(
                    "-", "").replace(r"\n", "").replace("\n",
                                                        "").replace(" ", "")
        item['coin'] = str(
            get_subscriber(
                '''//*[@id="arc_toolbar_report"]/div[1]/span[2]/text()''')
        ).replace("-", "").replace(r"\n", "").replace("\n",
                                                      "").replace(" ", "")
        item['collect'] = str(
            get_subscriber(
                '''//*[@id="arc_toolbar_report"]/div[1]/span[3]/text()''')
        ).replace("-", "").replace(r"\n", "").replace("\n",
                                                      "").replace(" ", "")
        # item['subscriber'] = response.xpath('''//*[@id="v_upinfo"]/div[3]/div[2]/span/span/text()''').extract()[0]
        item['subscriber'] = str(
            get_subscriber(
                '''//*[@id="v_upinfo"]/div[3]/div[2]/span/span/text()''')
        ).replace("-", "").replace(r"\n", "").replace("\n",
                                                      "").replace(" ", "")
        item['author'] = str(
            response.xpath('''//*[@id="v_upinfo"]/div[2]/div/a[1]/text()''').
            extract()[0]).replace("-",
                                  "").replace(r"\n",
                                              "").replace("\n",
                                                          "").replace(" ", "")
        item['av_num'] = 0

        return item
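Each field above repeats the same four .replace() calls; a single regex substitution is equivalent. A sketch (clean is a hypothetical helper name):

    import re

    def clean(text):
        # Remove dashes, literal "\n" two-character sequences, real newlines,
        # and spaces -- the same set the chained .replace() calls strip.
        return re.sub(r'-|\\n|\n| ', '', str(text))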
Example #11
 def parse_up_url(self, response):
     global totalPage
     totalPage = response.xpath(
         '//div[@class="ac-space-contribute-list"]//ul//li[@class="active"]//@data-count'
     ).extract()[0]
     totalPage = int(int(totalPage) / 20) + 1
     # up_id fills the template below; it is presumably a module-level
     # global here (compare self.up_id in Example #8).
     s = 'https://www.acfun.cn/u/{}?quickViewId=ac-space-video-list&reqID=4&ajaxpipe=' \
         '1&type=video&order=newest&page={}&pageSize=20&t=1587619209806'.format(up_id, 1)
     b = BilibiliItem()
     b['file_urls'] = [s]
     print(totalPage)
     yield b
Example #12
 def open_spider(self, spider):
     if not os.path.exists("spiders/yeyinfuStat"):
         os.mkdir("spiders/yeyinfuStat")
     self.f = open('spiders/yeyinfuStat/YeyinfuVideoTitles.txt', 'w')
     # newline='' keeps the csv module from writing blank lines on Windows.
     self.csvf = open('spiders/yeyinfuStat/YeyinfuVideoTitles.csv', 'w', newline='')
     self.itemf = open('spiders/yeyinfuStat/YeyinfuItems.csv', 'w', newline='')
     try:
         itemWriter = csv.writer(self.itemf)
         header = list(BilibiliItem().fields.keys())
         itemWriter.writerow(header)
     except Exception:
         pass
Example #13
 def parse(self, response):
     # '..//li' from the document root selects nothing; '//li' is intended.
     rank_list = response.xpath('//li[@class="rank-item"]')
     print("-----")
     for rank_item in rank_list:
         item = BilibiliItem()
         item['title'] = rank_item.xpath(
             './div[@class="content"]/div[@class="info"]/a/text()'
         )[0].extract()
         item['href'] = rank_item.xpath(
             './div[@class="content"]/div[@class="info"]/a/@href'
         )[0].extract()
         yield item
Example #14
 def bilibili_parse(self, response):
     item = BilibiliItem()
     r = json.loads(response.body)
     if response.status == 200 and r['status'] != False:
         data = r['data']
         for key in item.fields:
             if key == 'currentLevel':
                 item[key] = data['level_info']['current_level']
             else:
                 item[key] = data.get(key, 'no')
         yield item
     else:
         return
Example #15
 def parse_userinfo(self, response):
     '''
     Scrape the user's mid, name and sex fields.
     '''
     userinfo_resultJson = json.loads(response.body)
     userinfo_result = userinfo_resultJson['data']
     item = BilibiliItem()
     item['_id'] = userinfo_result['mid']
     item['name'] = userinfo_result['name']
     item['level'] = userinfo_result['level']
     item['coins'] = userinfo_result['coins']
     item['sex'] = userinfo_result['sex']
     yield item
Example #16
 def parse(self, response):
    rank_list = response.xpath('//ul[@class="rank-list"]/li')
    print("len:", len(rank_list))
    for i in rank_list:
        # A fresh item per row, so items already yielded are not mutated.
        item = BilibiliItem()
        item["number"] = i.xpath('./div[@class="num"]/text()').extract()[0]
        print(type(item["number"]))
        item["title"] = i.xpath('.//div[@class="info"]/a/text()').extract()[0]
        url = i.xpath('.//div[@class="info"]/a/@href').extract()[0]
        item["url"] = url.split("//")[-1]
        item["grade"] = i.xpath('.//div[@class="pts"]/div/text()').extract()[0]
        item["play_number"] = i.xpath('.//div[@class="detail"]/span[1]/text()').extract()[0]
        item["comments"] = i.xpath('.//div[@class="detail"]/span[2]/text()').extract()[0]
        yield item
Example #17
 def next(self, response):
     print('request succeeded')
     a = response.body.decode('utf-8', 'ignore')
     item = BilibiliItem()
     print(len(a))
     pat1 = '"uname":"(.*?)"'
     pat2 = '"sex":"(.*?)"'
     pat3 = '"sign":"(.*?)"'
     pat4 = '"message":"(.*?)"'
     item['uname'] = re.compile(pat1).findall(a)
     item['usex'] = re.compile(pat2).findall(a)
     item['usign'] = re.compile(pat3).findall(a)
     item['ucomment'] = re.compile(pat4).findall(a)
     return item
Example #18
    def parse_item(self, response):
        div = response.xpath("//div[@id='viewbox_report']")
        # Title
        title = div.xpath(".//span/text()").get()
        # Video category
        category = ">".join(
            div.xpath(".//span[@class='a-crumbs']/a/text()").getall())
        # Publish time
        publish_time = div.xpath(".//div[1]/span[2]/text()").get()
        # Play count
        play_text = div.xpath(".//span[contains(@title, '播放数')]/text()").get()
        play_count = re.sub(r"播放.*", "", play_text)
        # Danmaku (bullet-comment) count
        barrage_text = div.xpath(
            ".//span[contains(@title, '弹幕数')]/text()").get()
        barrage_count = re.sub(r"弹幕.*", "", barrage_text)
        # Like, coin, and favorite counts
        ops_list = [
            x.strip() for x in response.xpath(
                "//div[@class='ops']/span/text()").getall()
        ]
        like_count = ops_list[0] if ops_list[0] != "点赞" else "0"
        throw_coin_count = ops_list[1] if ops_list[1] != "投币" else "0"
        collection_count = ops_list[2] if ops_list[2] != "收藏" else "0"
        # Comment count
        comment_count = response.xpath(
            "//meta[@itemprop='commentCount']/@content").get()
        # Tag list
        tag_text = response.xpath(
            "//ul[contains(@class, 'tag-area')]/li//text()").getall()
        tag_names = ",".join(tag_text)

        info = {
            "title": title,
            "category": category,
            "publish_time": publish_time,
            "play_count": play_count,
            "barrage_count": barrage_count,
            "like_count": like_count,
            "throw_coin_count": throw_coin_count,
            "collection_count": collection_count,
            "comment_count": comment_count,
            "tag_names": tag_names
        }
        for k, v in info.copy().items():
            if ("_count" in k) and ("万" in v):
                info[k] = int(float(v.replace("万", "")) * 10000)

        yield BilibiliItem(**info)
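The final loop converts counts like "12.3万" (万 = 10,000) to integers. Extracted as a standalone helper, the same conversion looks like this (parse_count is a hypothetical name):

    def parse_count(text):
        # "12.3万" -> 123000; plain digit strings pass through unchanged.
        if "万" in text:
            return int(float(text.replace("万", "")) * 10000)
        return int(text)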
Example #19
 def parse_each_cartoon_comment(self, response):
     print('did I really get here?')
     all_comment = response.xpath(
         '//div[@class="review-list-wrp type-short"]//li')
     if all_comment:
         for comment in all_comment:
             print('finally got in!!!')
             item = BilibiliItem()
             item['username'] = comment.xpath(
                 './/div[@class="review-author-name"]//text()'
             ).extract_first().strip()
             item['comment'] = comment.xpath(
                 './/div[@class="review-content"]//text()').extract_first(
                 ).strip()
             yield item
Example #20
    def parse(self, response):
        ul = response.xpath('//*[@class="rank-list"]')
        if not ul:
            # Extraction failed; log the URL.
            self.log("----------------------%s" % response.url)
        else:
            # Extraction succeeded; log the URL.
            self.log("++++++++++++++++++++++%s" % response.url)

            # Collect the list of li nodes.
            lis = ul[0].xpath('./li')
            items = []
            for bilibili in lis:
                bilibili_item = BilibiliItem()
                try:
                    # Rank number
                    s_vnum = bilibili.xpath(
                        './div[@class="num"]/text()').extract()[0]
                    bilibili_item['vnum'] = s_vnum
                    # Title
                    s_title = bilibili.xpath(
                        './div[@class="content"]/div[@class="info"]/a/text()'
                    ).extract()[0]
                    bilibili_item['vtitle'] = s_title
                    # URL
                    s_url = bilibili.xpath(
                        './div[@class="content"]/div[@class="info"]/a/@href'
                    ).extract()[0]
                    #s_pic = bilibili.xpath('div[@class="content"]/div[@class="img"]/a/div/img/@src').extract()[0]
                    bilibili_item['vurl'] = s_url
                    # Overall score
                    s_pts = bilibili.xpath(
                        './div[@class="content"]/div[@class="info"]/div[@class="pts"]/div/text()'
                    ).extract()[0]
                    bilibili_item['vpts'] = s_pts
                    # s_author = bilibili.xpath('./div[@class="content"]/div[@class="info"]/div[@class="detail"]/a/span/@aid').extract()[0]
                    # bilibili_item['author'] = s_id
                    self.log('vnum=:' + bilibili_item['vnum'])
                    self.log('vtitle=:' + bilibili_item['vtitle'])
                    self.log('vurl=:' + bilibili_item['vurl'])
                    self.log('vpts=:' + bilibili_item['vpts'])
                    # print(bilibili_item)
                except IndexError as e:
                    # Log when any individual field is missing.
                    self.log("!!!!!!!!!!!" + str(e))
                items.append(bilibili_item)

            # Scrapy callbacks must yield items one at a time, not a list.
            for bilibili_item in items:
                yield bilibili_item
Example #21
 def parse(self, response):
     # print(response.url)
     js = json.loads(response.body)
     # code != 0 means the video cannot be crawled or does not exist.
     if js['code'] == 0:
         data = js['data']
         if isinstance(data['view'], int) and data['view'] >= 1000:
             item = BilibiliItem()
             # Copy everything from data into the item, then request the
             # second-level page to fill in the remaining fields.
             for key in data:
                 item[key] = data[key]
             # yield item
             yield scrapy.Request(
                 'https://www.bilibili.com/video/av{}'.format(data['aid']),
                 meta={'item': item},
                 callback=self.parse_detail)
Example #22
 def parse(self, response):
     # response.body is bytes; use the decoded text for a str regex.
     temp = response.text
     temp = re.search('({.+})', temp).group(1)
     datas = json.loads(temp)['result']
     if datas:
         for data in datas:
             # A fresh item per search result.
             item = BilibiliItem()
             item['title'] = data['title']
             item['author'] = data['author']
             item['play_count'] = data['play']
             item['danmu_count'] = data['video_review']
             item['arcurl'] = data['arcurl']
             yield item
         page_num = re.search(r'page=(\d+)', response.url).group(1)
         page_num = 'page=' + str(int(page_num)+1)
         next_url = re.sub(r'page=\d+', page_num, response.url)
         yield Request(next_url, headers=self.headers)
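The page increment above patches the URL with a regex. The query string can also be rebuilt with the standard library; a sketch (next_page_url is a hypothetical name):

    from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

    def next_page_url(url):
        # Bump the 'page' query parameter by one and reassemble the URL.
        parts = urlparse(url)
        query = parse_qs(parts.query)
        query['page'] = [str(int(query.get('page', ['1'])[0]) + 1)]
        return urlunparse(parts._replace(query=urlencode(query, doseq=True)))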
Example #23
 def parse2(self, response):
     payload = json.loads(response.text)
     data = payload["data"]
     item = BilibiliItem()
     item["userid"] = data["mid"]
     item["username"] = data["name"]
     item["head_img"] = data["face"]
     item["register_time"] = data["regtime"]
     item["birthday"] = data["birthday"]
     item["place"] = data["place"]
     yield item
Example #24
    def parse_item(self, response):

        print "*" * 20, response.url
        sel = Selector(response)
        item = BilibiliItem()

        item["url"] = response.url
        item["crawl_time"] = int(time.time())
        item["title"] = ''.join(sel.xpath("//title/text()").extract())
        item["keywords"] = ''.join(
            sel.xpath('//meta[@name="keywords"]/@content').extract())
        item["description"] = ''.join(
            sel.xpath('//meta[@name="description"]/@content').extract())
        item["author"] = ''.join(
            sel.xpath('//meta[@name="author"]/@content').extract())
        item["cover_image"] = ''.join(
            sel.xpath('//img[@class="cover_image"]/@src').extract())
        item["h_title"] = ''.join(sel.xpath('//h1/@title').extract())
        item["startDate"] = ''.join(
            sel.xpath('//time[@itemprop="startDate"]/@datetime').extract())
        item["info"] = self.extract_info(sel)
        item["upinfo"] = self.extract_upinfo(sel)
        item["video_info"] = self.extract_video_info(sel)
        item["tag_list"] = self.extract_tag_list(sel)
        # cid aid
        m = re.findall(r"cid=([0-9]+)&aid=([0-9]+)", response.text)
        print(m)
        if len(m) == 1:
            item["cid"] = m[0][0]
            item["aid"] = m[0][1]
            item["stats"] = self.extract_stats(sel, item["aid"], item)
            json_data = json.loads(item["stats"])
            if "data" in json_data:
                json_data2 = json_data["data"]
                item["view"] = json_data2["view"]
                item["danmaku"] = json_data2["danmaku"]
                item["reply"] = json_data2["reply"]
                item["favorite"] = json_data2["favorite"]
                item["coin"] = json_data2["coin"]
                item["share"] = json_data2["share"]
        else:
            print(response.url, "cid wrong")

        return item
Example #25
 def parse(self, response):
     """Extract the ep_id list and the cid list."""
     # Keep the inner group non-greedy and mind the whitespace,
     # or the regex will capture too much.
     epList_str = re.findall(r"\"epList\":\s*\[(.*?)\],",
                             response.body.decode(), re.DOTALL)[0]
     # with open("str.html","w") as f:
     #     f.write(epList_str)
     ep_id_list = re.findall(r",\"id\":\s*(.*?),", epList_str, re.DOTALL)
     cid_list = re.findall(r"\"cid\":\s*(.*?),", epList_str, re.DOTALL)
     # 104 episodes in total
     pop_url = "https://comment.bilibili.com/{}.xml"
     av_url = "https://www.bilibili.com/bangumi/play/ep{}"
     # zip pairs each cid with its ep_id; list.index() would break on duplicates.
     for ep_id, cid in zip(ep_id_list, cid_list):
         item = BilibiliItem()
         item["av_url"] = av_url.format(ep_id)
         url = pop_url.format(cid)
         yield scrapy.Request(url,
                              callback=self.detail,
                              meta={"item": item})
Example #26
 def parse(self, response):
     drama = json.loads(response.text)
     data = drama['data']
     data_list = data['list']
     #print(data_list)
     for field in data_list:
         # A fresh item per entry, numbered with the running counter.
         item = BilibiliItem()
         item['number'] = self.i
         item['badge'] = field['badge']
         item['cover_img'] = field['cover']
         item['index_show'] = field['index_show']
         item['link'] = field['link']
         item['media_id'] = field['media_id']
         item['order_type'] = field['order_type']
         item['season_id'] = field['season_id']
         item['title'] = field['title']
         #print(self.i, item)
         self.i += 1
         yield item
Example #27
 def parse(self, response):
     path = AbsDirectory.file_path + 'bilibili/bilibili/spiders/tomcat/full/'
     for x in os.listdir(path):
         os.remove(path + x)
     cookie1 = response.headers.getlist('Set-Cookie')
     cookies = {}
     for raw in cookie1:
         # Header values are bytes; decode before splitting out name=value.
         raw = raw.decode('utf-8')
         name, _, value = raw.split(';')[0].partition('=')
         cookies[name] = value
     a = BilibiliItem()
     #url ='https://ali-video.acfun.cn/mediacloud/acfun/acfun_video/segment/'+self.start_urls[0]
     a['file_urls'] = [self.start_urls[0]]
     # if self.start_urls[0] is not None:
     # yield scrapy.Request(self.start_urls[0], cookies=cookies, callback=self.parse_2)
     yield scrapy.Request(self.start_urls[0],
                          cookies=cookies,
                          callback=self.parse_2)
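An alternative to splitting Set-Cookie headers by hand is the stdlib cookie parser; a sketch (cookies_from_headers is a hypothetical name):

    from http.cookies import SimpleCookie

    def cookies_from_headers(set_cookie_headers):
        # Parse each raw Set-Cookie header and collect name -> value pairs.
        jar = {}
        for raw in set_cookie_headers:
            cookie = SimpleCookie()
            cookie.load(raw.decode('utf-8') if isinstance(raw, bytes) else raw)
            for name, morsel in cookie.items():
                jar[name] = morsel.value
        return jar

Calling cookies_from_headers(response.headers.getlist('Set-Cookie')) would replace the manual loop above.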
Example #28
 def parse1(self, response):
     page = Selector(response)
     try:
         titles = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/a/text()').extract()
         urls = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/a/@href').extract()
         texts = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/div[@class="v-desc"]/text()').extract()
         peoples = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/div[@class="v-info"]/span[1]/span/text()').extract()
         danmus = page.xpath('//ul[@class="vd-list mod-2"]/li/div/div[2]/div[@class="v-info"]/span[2]/span/text()').extract()
         class_name = page.xpath('//li[@class="on"]/a/text()').extract()
         for i in range(len(titles)):
             # One item per video entry.
             item = BilibiliItem()
             item['class_name'] = class_name[0]
             item['title'] = self.cleanInput(titles[i])
             item['url'] = urls[i]
             item['text'] = self.cleanInput(texts[i])
             item['people'] = peoples[i]
             item['danmu'] = danmus[i]
             yield item
     except Exception:
         pass
Example #29
    def next_information_parse(self, response):
        print("HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH")
        print(response.status)
        if response.status == 403:
            print("HHHHHHHHHHHHHHHHHHHHHHHHHH page_information_parse HHHHHHHHHHHHHHHHHHHHHHHHHH")
            # Back off for six minutes before retrying the blocked URL.
            time.sleep(60 * 6)
            yield scrapy.Request(response.url,
                                 headers=response.request.headers,
                                 meta={"status": 403})
            return

        print(response.url)
        self.sum += 1
        print("count: " + str(self.sum))

        try:
            info = json.loads(response.body[38:-1], strict=False)
            for archive in info["data"]["archives"]:
                item = BilibiliItem()
                item["aid"] = archive["aid"]
                item["cid"] = archive["cid"]
                item["copyright"] = archive["copyright"]  # 版权:1、可以正常转载;2、无法转载
                item["tname"] = archive["tname"]  # 类别
                item["title"] = archive["title"]
                item["videos"] = archive["videos"]  # page数量
                item["ctime"] = archive["ctime"]  # 创建时间
                item["pubdate"] = archive["pubdate"]  # 更新时间
                item["duration"] = archive["duration"]  # 视频时长
                item["coin"] = archive["stat"]["coin"]
                item["favorite"] = archive["stat"]["favorite"]
                item["likes"] = archive["stat"]["like"]
                item["archive"] = json.dumps(archive)
                yield item
        except Exception as e:
            print(e)
            if "try" not in response.meta:
                yield scrapy.Request(response.url,
                                     callback=self.next_information_parse,
                                     meta={"try": 1},
                                     headers=response.request.headers,
                                     dont_filter=False)
Example #30
 def parse(self, response):
     try:
         sql.delete_requested(response.meta['page'])
         item = BilibiliItem()
         jsdict = json.loads(response.text)
         jsdata = jsdict['data']
         item['name_'] = str(jsdata['name'])
         item['uid'] = jsdata['mid']
         item['play_num'] = jsdata['playNum']
         item['sex'] = jsdata['sex']
         if 'birthday' in jsdata:
             item['birthday'] = jsdata['birthday'][5:]
         else:
             item['birthday'] = ''
         if 'place' in jsdata:
             item['area'] = jsdata['place']
         else:
             item['area'] = ''
         if 'regtime' in jsdata:
             reg_time = time.localtime(jsdata['regtime'])
             item['reg_time'] = time.strftime('%Y-%m-%d', reg_time)
         else:
             item['reg_time'] = ''
         item['coins'] = jsdata['coins']
         item['article'] = jsdata['article']
         item['level_'] = jsdata['level_info']['current_level']
         item['exp'] = jsdata['level_info']['current_exp']
         item['description'] = jsdata['description']
         url = 'http://api.bilibili.com/x/relation/stat?vmid=' + response.meta[
             'page'] + '&jsonp=jsonp'
         try:
             data = requests.get(url).text
             js_fans = json.loads(data)
             item['following'] = js_fans['data']['following']
             item['fans'] = js_fans['data']['follower']
         except Exception:
             pass
         return item
     except Exception:
         print('uid:%d does not exist' % int(response.meta['page']))