def parse(self, response):
    """Turn one page of the JSON room listing into ``LiveItem`` objects.

    One item is yielded for every entry under ``data.items``. Afterwards the
    page counter is advanced and, while it does not exceed
    ``self.max_page_num``, a request for the next page is yielded with this
    method as its callback.
    """
    payload = json.loads(response.text)
    for entry in payload.get('data').get('items'):
        live = LiveItem()
        channel_info = entry.get('channel')
        live['rid'] = str(channel_info.get('id'))
        live['nn'] = channel_info.get('name')
        live['rn'] = channel_info.get('status')

        # Two category names are stored under a shorter label.
        category_name = entry.get('game')[0].get('name')
        if category_name == '地下城与勇士':
            category_name = 'DNF'
        elif category_name == '主机游戏(综合)':
            category_name = '主机游戏'
        live['category'] = category_name

        live['url'] = channel_info.get('url')
        live['ol'] = entry.get('viewers')
        live['preview'] = entry.get('preview')
        live['live_category'] = self.live_category
        yield live

    self.page_num += 1
    if self.page_num > self.max_page_num:
        return
    # The URL suffix for a page is derived from the page number as page * 18 - 1.
    next_suffix = str(self.page_num * 18 - 1)
    yield response.follow(self.base_url + next_suffix, callback=self.parse)
def parse(self, response):
    """Yield one loaded ``LiveItem`` per room link, tagged with platform 'panda'.

    Every ``<a>`` directly under an ``<li>`` of the element with id
    ``sortdetail-container`` is scraped with its own ``ItemLoader``; the
    XPaths passed to ``add_xpath`` are relative to that anchor.
    """
    for content in response.xpath('//*[@id="sortdetail-container"]/li/a'):
        loader = ItemLoader(item=LiveItem(), selector=content)
        # Room title
        loader.add_xpath('title', 'div[2]/span[1]/text()')
        # Streamer user name
        loader.add_xpath('username', 'div[2]/span[2]/@title')
        # Popularity figure
        loader.add_xpath('num', 'div[2]/span[4]/i/text()')
        # Address of the preview image
        loader.add_xpath('pic_addr', 'div[1]/img/@data-original')
        # Relative address of the live room
        loader.add_xpath('addr', '@href')
        # Streaming platform
        loader.add_value('platform', 'panda')
        yield loader.load_item()
def parse(self, response):
    """Yield one loaded ``LiveItem`` per room link, tagged with platform 'yy'.

    Room anchors are located with an absolute XPath from the document root;
    each one is scraped with its own ``ItemLoader`` and the XPaths passed to
    ``add_xpath`` are relative to that anchor.
    """
    room_links = response.xpath(
        '/html/body/div[3]/div[2]/div/div/div[2]/div/ul/li/a'
    )
    for content in room_links:
        loader = ItemLoader(item=LiveItem(), selector=content)
        # Room title
        loader.add_xpath('title', 'span[3]/text()')
        # Streamer user name
        loader.add_xpath('username', 'span[5]/text()')
        # Popularity figure
        loader.add_xpath('num', 'span[2]/text()')
        # Address of the preview image
        loader.add_xpath('pic_addr', 'span[1]/span[1]/img/@data-original')
        # Relative address of the live room
        loader.add_xpath('addr', '@href')
        # Streaming platform
        loader.add_value('platform', 'yy')
        yield loader.load_item()
def parse(self, response):
    """Yield one loaded ``LiveItem`` per room link, tagged with platform 'douyu'.

    Every ``<a>`` directly under an ``<li>`` of the element with id
    ``live-list-contentbox`` is scraped with its own ``ItemLoader``; the
    XPaths passed to ``add_xpath`` are relative to that anchor.
    """
    for content in response.xpath('//*[@id="live-list-contentbox"]/li/a'):
        loader = ItemLoader(item=LiveItem(), selector=content)
        # Room title
        loader.add_xpath('title', 'div/div/h3/text()')
        # Streamer user name
        loader.add_xpath('username', 'div[1]/p/span[1]/text()')
        # Popularity figure
        loader.add_xpath('num', 'div[1]/p/span[2]/text()')
        # Address of the preview image
        loader.add_xpath('pic_addr', 'span/img/@data-original')
        # Relative address of the live room
        loader.add_xpath('addr', '@href')
        # Streaming platform
        loader.add_value('platform', 'douyu')
        yield loader.load_item()
def room_json(self, response):
    """Yield a ``LiveItem`` for every room in a JSON room-list reply.

    Nothing is produced unless ``resultCode`` equals 0. The category comes
    from ``response.meta['category']`` (with 'LOL' stored under its Chinese
    name) and the room URL is ``liveUrl`` prefixed with the yy.com host.
    """
    payload = json.loads(response.text)
    if payload.get('resultCode') != 0:
        return

    for entry in payload.get('data').get('data'):
        live = LiveItem()
        live['rid'] = str(entry.get('id'))
        live['nn'] = entry.get('name')
        live['rn'] = entry.get('desc')
        # Read per room, so an empty room list never touches response.meta.
        category_name = response.meta['category']
        live['category'] = '英雄联盟' if category_name == 'LOL' else category_name
        live['url'] = 'http://www.yy.com' + entry.get('liveUrl')
        live['ol'] = entry.get('users')
        live['preview'] = entry.get('thumb')
        live['live_category'] = self.live_category
        yield live
def parse(self, response):
    """Convert one page of the zhanqi.tv room JSON into ``LiveItem`` objects.

    Nothing is produced unless the reply's ``message`` field is an empty
    string. After the rooms of such a page are yielded, the page counter is
    incremented and ``<base_url><page>.json`` is requested while the counter
    stays within ``self.max_page_num``.
    """
    payload = json.loads(response.text)
    if payload.get('message') != '':
        return

    for entry in payload.get('data').get('rooms'):
        live = LiveItem()
        live['rid'] = str(entry.get('id'))
        live['nn'] = entry.get('nickname')
        live['rn'] = entry.get('title')
        live['category'] = entry.get('gameName')
        live['url'] = 'https://www.zhanqi.tv/' + entry.get('url')
        live['ol'] = entry.get('online')
        live['preview'] = entry.get('spic')
        live['live_category'] = self.live_category
        yield live

    self.page_num += 1
    if self.page_num > self.max_page_num:
        return
    next_page_url = self.base_url + str(self.page_num) + '.json'
    yield response.follow(next_page_url, callback=self.parse)
def parse(self, response):
    """Convert one page of the douyu.com room JSON into ``LiveItem`` objects.

    Nothing is produced unless the reply's ``msg`` field equals 'success'.
    After the rooms of such a page are yielded, the page counter is
    incremented and the next page is requested while the counter stays
    within ``self.max_page_num``.
    """
    payload = json.loads(response.text)
    if payload.get('msg') != 'success':
        return

    for entry in payload.get('data').get('rl'):
        live = LiveItem()
        live['rid'] = str(entry.get('rid'))
        live['nn'] = entry.get('nn')
        live['rn'] = entry.get('rn')
        live['category'] = entry.get('c2name')
        live['url'] = 'https://www.douyu.com' + entry.get('url')
        live['ol'] = entry.get('ol')
        live['preview'] = entry.get('rs1')
        live['live_category'] = self.live_category
        yield live

    self.page_num += 1
    if self.page_num > self.max_page_num:
        return
    next_page_url = self.base_url + str(self.page_num)
    yield response.follow(next_page_url, callback=self.parse)
def parse(self, response):
    """Convert one page of the panda.tv room JSON into ``LiveItem`` objects.

    Nothing is produced unless the reply's ``errno`` equals 0. After the
    rooms of such a page are yielded, the page counter is incremented and the
    next page is requested while the counter stays within
    ``self.max_page_num``.
    """
    response_json = json.loads(response.text)
    if response_json.get('errno') != 0:
        return

    for room in response_json.get('data').get('items'):
        item = LiveItem()
        room_id = str(room.get('id'))
        item['rid'] = room_id
        item['nn'] = room.get('userinfo').get('nickName')
        item['rn'] = room.get('name')
        category = room.get('classification').get('cname')
        # The outdoor category is stored under a shorter label.
        item['category'] = '户外' if category == '户外直播' else category
        # Build the URL from the stringified id: concatenating the raw JSON
        # value raises TypeError whenever the API returns a numeric id.
        item['url'] = 'https://www.panda.tv/' + room_id
        item['ol'] = room.get('person_num')
        item['preview'] = room.get('pictures').get('img')
        item['live_category'] = self.live_category
        yield item

    self.page_num += 1
    if self.page_num <= self.max_page_num:
        yield response.follow(self.base_url + str(self.page_num), callback=self.parse)
def parse_content(self, response):
    """Build a single news ``LiveItem`` from an article page.

    Fields set on the item (every value is a list):

    * ``image_urls`` - thumbnail URLs with the query string removed, or
      ``['']`` when the page has no thumbnail.
    * ``datime``     - today's date formatted as ``YYYY-MM-DD``.
    * ``title``      - text nodes of the ``h1.entry-title`` heading.
    * ``category``   - always ``['news']``.
    * ``article``    - the ``<p>`` blocks of the entry content joined into one
      string after links are unwrapped and images, iframe paragraphs, styled
      paragraphs and scripts are removed.
    * ``des``        - the ``<ul>`` blocks of the entry content with links
      unwrapped, or ``['']`` when there are none.
    * ``img``        - one ``<img>`` tag per thumbnail pointing at the static
      image host, or ``['']`` when there is no usable file name.

    Parsing is best-effort: any exception is reported on stdout and the page
    then yields nothing.
    """
    item = LiveItem()
    sel = Selector(response)

    def unwrap_links(html):
        # Anchors are only stripped when a real link is present, so other
        # tags that merely start with "<a" are left alone in link-free markup.
        if '<a href="' in html:
            html = re.sub(r'<a.*?">', '', html)
            html = re.sub(r'</a>', '', html)
        return html

    try:
        thumb_urls = sel.xpath(
            r'//figure[@class="single-thumb single-thumb-full"]/img/@src'
        ).extract()
        if thumb_urls:
            # Keep everything before the first '?'. Indexing the split result
            # with [-2] raised IndexError for URLs without a query string,
            # which silently discarded the whole article.
            clean_urls = [url.split('?', 1)[0] for url in thumb_urls]
            file_names = [url.split('/')[-1] for url in clean_urls]
        else:
            clean_urls = ['']
            file_names = ['']
        item['image_urls'] = clean_urls

        des_blocks = [
            unwrap_links(block)
            for block in sel.xpath('//div[@class="entry-content"]/ul').extract()
        ]

        item['datime'] = [datetime.datetime.now().strftime('%Y-%m-%d')]
        title = sel.xpath('//h1[@class="entry-title"]/text()').extract()
        item['title'] = title
        item['category'] = ['news']

        # Applied in this order to every paragraph; each substitution is a
        # no-op when its tag is absent.
        strip_patterns = (
            r'<img.*?>',
            r'<p><iframe.*?</p>',
            r'<p style=.*?</p>',
            r'<script.*?</script>',
        )
        paragraphs = []
        for para in sel.xpath('//div[@class="entry-content"]/p').extract():
            para = unwrap_links(para)
            for pattern in strip_patterns:
                para = re.sub(pattern, '', para)
            paragraphs.append(para)
        item['article'] = [''.join(paragraphs)]

        if des_blocks and '' not in des_blocks:
            item['des'] = des_blocks
        else:
            item['des'] = ['']

        if '' in file_names:
            item['img'] = ['']
        else:
            # The last heading text node is used as alt text; a page without
            # a heading gets an empty alt instead of failing.
            alt_text = title[-1] if title else ''
            item['img'] = [
                '<img src="http://www.actualites-les.com/static/images/lac/'
                + name
                + '" width="600" height="350" alt="'
                + alt_text
                + '">'
                for name in file_names
            ]
    except Exception as e:
        # Single-argument form prints the same text on Python 2 and Python 3.
        print('内容解析错误原因: %s' % e)
    else:
        yield item