Exemple #1
0
    def parse(self, response):
        """Yield a ``LiveItem`` for every room in the JSON body, then follow the next page.

        Rooms are read from ``data.items``. Each room supplies a ``channel``
        mapping (id, name, status, url), a ``game`` list whose first entry
        names the category, plus ``viewers`` and ``preview``. Two category
        names are replaced by shorter aliases.

        After the rooms, ``self.page_num`` is incremented and, while it does
        not exceed ``self.max_page_num``, a follow-up request to
        ``self.base_url + str(page_num * 18 - 1)`` is yielded with this
        method as its callback.
        """
        payload = json.loads(response.text)

        for entry in payload.get('data').get('items'):
            live = LiveItem()
            channel_info = entry.get('channel')
            live['rid'] = str(channel_info.get('id'))
            live['nn'] = channel_info.get('name')
            live['rn'] = channel_info.get('status')

            # Category comes from the first listed game; shorten known long names.
            game_name = entry.get('game')[0].get('name')
            if game_name == '地下城与勇士':
                game_name = 'DNF'
            elif game_name == '主机游戏(综合)':
                game_name = '主机游戏'
            live['category'] = game_name

            live['url'] = channel_info.get('url')
            live['ol'] = entry.get('viewers')
            live['preview'] = entry.get('preview')
            live['live_category'] = self.live_category
            yield live

        self.page_num += 1
        if self.page_num > self.max_page_num:
            return
        next_suffix = str(self.page_num * 18 - 1)
        yield response.follow(self.base_url + next_suffix, callback=self.parse)
Exemple #2
0
 def parse(self, response):
     """Yield one loaded ``LiveItem`` per room anchor on the listing page.

     Every ``<a>`` under ``#sortdetail-container`` gets its own
     ``ItemLoader`` bound to that anchor, so the XPaths below are relative
     to the anchor element. The ``platform`` field is fixed to ``'panda'``.
     """
     for anchor in response.xpath('//*[@id="sortdetail-container"]/li/a'):
         loader = ItemLoader(item=LiveItem(), selector=anchor)
         # Room title
         loader.add_xpath('title', 'div[2]/span[1]/text()')
         # Streamer user name
         loader.add_xpath('username', 'div[2]/span[2]/@title')
         # Popularity figure
         loader.add_xpath('num', 'div[2]/span[4]/i/text()')
         # Preview image address
         loader.add_xpath('pic_addr', 'div[1]/img/@data-original')
         # Relative address of the live room
         loader.add_xpath('addr', '@href')
         # Streaming platform
         loader.add_value('platform', 'panda')
         yield loader.load_item()
Exemple #3
0
 def parse(self, response):
     """Yield one loaded ``LiveItem`` per room anchor on the listing page.

     Anchors are located with an absolute XPath from ``/html/body``; each
     one gets its own ``ItemLoader`` bound to that anchor, so the XPaths
     below are relative to it. The ``platform`` field is fixed to ``'yy'``.
     """
     for anchor in response.xpath('/html/body/div[3]/div[2]/div/div/div[2]/div/ul/li/a'):
         loader = ItemLoader(item=LiveItem(), selector=anchor)
         # Room title
         loader.add_xpath('title', 'span[3]/text()')
         # Streamer user name
         loader.add_xpath('username', 'span[5]/text()')
         # Popularity figure
         loader.add_xpath('num', 'span[2]/text()')
         # Preview image address
         loader.add_xpath('pic_addr', 'span[1]/span[1]/img/@data-original')
         # Relative address of the live room
         loader.add_xpath('addr', '@href')
         # Streaming platform
         loader.add_value('platform', 'yy')
         yield loader.load_item()
Exemple #4
0
 def parse(self, response):
     """Yield one loaded ``LiveItem`` per room anchor on the listing page.

     Every ``<a>`` under ``#live-list-contentbox`` gets its own
     ``ItemLoader`` bound to that anchor, so the XPaths below are relative
     to the anchor element. The ``platform`` field is fixed to ``'douyu'``.
     """
     for anchor in response.xpath('//*[@id="live-list-contentbox"]/li/a'):
         loader = ItemLoader(item=LiveItem(), selector=anchor)
         # Room title
         loader.add_xpath('title', 'div/div/h3/text()')
         # Streamer user name
         loader.add_xpath('username', 'div[1]/p/span[1]/text()')
         # Popularity figure
         loader.add_xpath('num', 'div[1]/p/span[2]/text()')
         # Preview image address
         loader.add_xpath('pic_addr', 'span/img/@data-original')
         # Relative address of the live room
         loader.add_xpath('addr', '@href')
         # Streaming platform
         loader.add_value('platform', 'douyu')
         yield loader.load_item()
Exemple #5
0
    def room_json(self, response):
        """Yield a ``LiveItem`` for every room in a JSON room-list response.

        Nothing is yielded unless ``resultCode`` equals 0. Rooms are read
        from ``data.data``; the category is taken from
        ``response.meta['category']`` (with ``'LOL'`` replaced by its Chinese
        name) and the room URL is ``liveUrl`` appended to the yy.com host.
        """
        payload = json.loads(response.text)
        if payload.get('resultCode') != 0:
            return

        for entry in payload.get('data').get('data'):
            live = LiveItem()
            live['rid'] = str(entry.get('id'))
            live['nn'] = entry.get('name')
            live['rn'] = entry.get('desc')

            category = response.meta['category']
            live['category'] = '英雄联盟' if category == 'LOL' else category

            live['url'] = 'http://www.yy.com' + entry.get('liveUrl')
            live['ol'] = entry.get('users')
            live['preview'] = entry.get('thumb')
            live['live_category'] = self.live_category
            yield live
Exemple #6
0
    def parse(self, response):
        """Yield a ``LiveItem`` per room in the JSON body, then follow the next page.

        Rooms under ``data.rooms`` are only read when the ``message`` field
        is the empty string. Regardless of that check, ``self.page_num`` is
        incremented afterwards and, while it does not exceed
        ``self.max_page_num``, a request to
        ``self.base_url + str(page_num) + '.json'`` is yielded with this
        method as its callback.
        """
        payload = json.loads(response.text)

        if payload.get('message') == '':
            for entry in payload.get('data').get('rooms'):
                live = LiveItem()
                live['rid'] = str(entry.get('id'))
                live['nn'] = entry.get('nickname')
                live['rn'] = entry.get('title')
                live['category'] = entry.get('gameName')
                live['url'] = 'https://www.zhanqi.tv/' + entry.get('url')
                live['ol'] = entry.get('online')
                live['preview'] = entry.get('spic')
                live['live_category'] = self.live_category
                yield live

        self.page_num += 1
        if self.page_num > self.max_page_num:
            return
        next_url = self.base_url + str(self.page_num) + '.json'
        yield response.follow(next_url, callback=self.parse)
Exemple #7
0
    def parse(self, response):
        """Yield a ``LiveItem`` per room in the JSON body, then follow the next page.

        Rooms under ``data.rl`` are only read when the ``msg`` field equals
        ``'success'``. Regardless of that check, ``self.page_num`` is
        incremented afterwards and, while it does not exceed
        ``self.max_page_num``, a request to ``self.base_url + str(page_num)``
        is yielded with this method as its callback.
        """
        payload = json.loads(response.text)

        if payload.get('msg') == 'success':
            for entry in payload.get('data').get('rl'):
                live = LiveItem()
                live['rid'] = str(entry.get('rid'))
                live['nn'] = entry.get('nn')
                live['rn'] = entry.get('rn')
                live['category'] = entry.get('c2name')
                live['url'] = 'https://www.douyu.com' + entry.get('url')
                live['ol'] = entry.get('ol')
                live['preview'] = entry.get('rs1')
                live['live_category'] = self.live_category
                yield live

        self.page_num += 1
        if self.page_num > self.max_page_num:
            return
        next_url = self.base_url + str(self.page_num)
        yield response.follow(next_url, callback=self.parse)
Exemple #8
0
    def parse(self, response):
        """Yield a ``LiveItem`` per room in the JSON body, then follow the next page.

        Rooms under ``data.items`` are only read when ``errno`` equals 0.
        Regardless of that check, ``self.page_num`` is incremented afterwards
        and, while it does not exceed ``self.max_page_num``, a request to
        ``self.base_url + str(page_num)`` is yielded with this method as its
        callback.
        """
        response_json = json.loads(response.text)
        errno = response_json.get('errno')
        if errno == 0:
            room_list = response_json.get('data').get('items')
            for room in room_list:
                item = LiveItem()
                room_id = str(room.get('id'))
                item['rid'] = room_id
                item['nn'] = room.get('userinfo').get('nickName')
                item['rn'] = room.get('name')
                item['category'] = room.get('classification').get('cname')
                if item['category'] == '户外直播':
                    item['category'] = '户外'
                # Build the URL from the stringified id: concatenating the raw
                # value raised TypeError whenever the API returned a non-string id.
                item['url'] = 'https://www.panda.tv/' + room_id
                item['ol'] = room.get('person_num')
                item['preview'] = room.get('pictures').get('img')
                item['live_category'] = self.live_category
                yield item

        self.page_num += 1
        if self.page_num <= self.max_page_num:
            yield response.follow(self.base_url + str(self.page_num),
                                  callback=self.parse)
Exemple #9
0
    def parse_content(self, response):
        """Extract one article from ``response`` and yield it as a ``LiveItem``.

        Fields set on the item (each one a list):

        * ``image_urls`` - thumbnail URLs with the query string removed, or
          ``['']`` when the page has no thumbnail.
        * ``datime`` - today's date formatted as ``YYYY-MM-DD``.
        * ``title`` - text of the ``h1.entry-title`` elements.
        * ``category`` - always ``['news']``.
        * ``article`` - the ``<p>`` blocks of ``div.entry-content`` joined
          into one string after removing links, images, iframe paragraphs,
          styled paragraphs and scripts.
        * ``des`` - the ``<ul>`` blocks of ``div.entry-content`` with link
          tags removed, or ``['']`` when there are none.
        * ``img`` - one ``<img>`` tag per thumbnail pointing at the static
          image host, or ``['']`` when any thumbnail file name is empty.

        Any exception raised during extraction is caught and printed; in that
        case nothing is yielded.
        """
        def strip_links(markup):
            # Only fragments containing a real href anchor are rewritten, so
            # other tags starting with "<a" are left untouched.
            if '<a href="' not in markup:
                return markup
            return re.sub('</a>', '', re.sub('<a.*?">', '', markup))

        item = LiveItem()
        page = Selector(response)
        try:
            image_srcs = page.xpath(
                r'//figure[@class="single-thumb single-thumb-full"]/img/@src'
            ).extract()
            if image_srcs:
                image_urls = []
                image_names = []
                for src in image_srcs:
                    pieces = src.split('?')
                    # Keep the segment just before the last '?'. A URL without
                    # a query string used to raise IndexError here and discard
                    # the whole item; it is now used as-is.
                    bare_url = pieces[-2] if len(pieces) > 1 else src
                    image_urls.append(bare_url)
                    image_names.append(bare_url.split('/')[-1])
            else:
                image_urls = ['']
                image_names = ['']
            item['image_urls'] = image_urls

            summaries = [
                strip_links(block)
                for block in page.xpath('//div[@class="entry-content"]/ul').extract()
            ]

            item['datime'] = [str(datetime.datetime.now().strftime('%Y-%m-%d'))]
            titles = page.xpath('//h1[@class="entry-title"]/text()').extract()
            item['title'] = titles
            item['category'] = ['news']

            # Applied in this order to every paragraph, after link removal.
            removable = (
                '<img.*?>',
                '<p><iframe.*?</p>',
                '<p style=.*?</p>',
                '<script.*?</script>',
            )
            cleaned = []
            for fragment in page.xpath('//div[@class="entry-content"]/p').extract():
                fragment = strip_links(fragment)
                for pattern in removable:
                    fragment = re.sub(pattern, '', fragment)
                cleaned.append(fragment)
            item['article'] = [''.join(cleaned)]

            if not summaries or '' in summaries:
                item['des'] = ['']
            else:
                item['des'] = summaries

            if '' in image_names:
                item['img'] = ['']
            else:
                # The alt text is the last extracted title. A page without a
                # title used to hit an unbound variable here (swallowed by the
                # except below); fall back to an empty alt instead.
                alt_text = titles[-1] if titles else ''
                item['img'] = [
                    '<img src="'
                    + 'http://www.actualites-les.com/static/images/lac/' + name
                    + '" width="600" height="350" alt="' + alt_text + '">'
                    for name in image_names
                ]
            yield item

        except Exception as e:
            # Single-argument form prints the same text on Python 2 and 3.
            print('内容解析错误原因:' + ' ' + str(e))