def parse(self, response):
    """Yield one ``MusicItem`` per row of the ``.m-table`` table.

    For every ``<tr>`` directly under the table's ``<tbody>`` the ``title``
    attributes of the ``td b`` elements are selected; entry 0 is used for the
    song name and entry 3 for the singer.

    Raises:
        IndexError: if a row has fewer than four ``td b`` title attributes.
    """
    # ``>`` is the CSS child combinator; the previous ``tbody<tr`` is not a
    # valid selector and cannot be parsed.
    rows = response.css('.m-table tbody>tr')
    for tr in rows:
        titles = tr.css('td b::attr(title)')
        # NOTE(review): ``titles`` already holds the attribute values, so
        # re-querying them with ``.css(...)`` looks unintended -- confirm
        # whether ``tr.css(...)`` was meant for name and singer.
        name = titles[0].css('td b::attr(title)').extract_first()
        singer = titles[3].css('span::attr(title)').extract_first()
        item = MusicItem()
        # NOTE(review): attribute-style assignment is kept as written; if
        # MusicItem is a scrapy.Item it needs item['name'] = ... -- confirm.
        item.name = name
        item.singer = singer
        yield item
def parse(self, response):
    """Yield a ``MusicItem`` for each ``<li>`` of the "latest tracks" list.

    Each item carries the extracted artist texts, track-name texts and every
    ``href`` attribute found inside the list entry (all as lists).
    """
    # The selection has to end on the <li> elements: one entry per track.
    track_nodes = response.xpath(
        '//section[@class="latest-tracks"]/ul[@class="latest-tracks tracks-list clearfix"]/li'
    )
    for node in track_nodes:
        item = MusicItem()
        # The leading ``.//`` keeps every lookup relative to the current <li>.
        artist_texts = node.xpath('.//span[@class="artist"]/text()').extract()
        item['artist'] = artist_texts
        name_texts = node.xpath('.//span[@class="name"]/text()').extract()
        item['trackName'] = name_texts
        hrefs = node.xpath('.//@href').extract()
        item['trackUrl'] = hrefs
        yield item
def parse_item(self, response):
    """Return a new, unpopulated ``MusicItem`` for ``response``.

    A selector is built for the response, but no field is extracted yet.
    """
    selector = HtmlXPathSelector(response)
    item = MusicItem()
    # Field extraction is still disabled; the lookups sketched so far were:
    #   item['domain_id'] = selector.select('//input[@id="sid"]/@value').extract()
    #   item['name'] = selector.select('//div[@id="name"]').extract()
    #   item['description'] = selector.select('//div[@id="description"]').extract()
    return item
def list_parse(self, response):
    """Parse one page of song search results.

    For every song row a ``MusicItem`` is filled with tag, id, title, singer
    and (when present) album title, and a ``FormRequest`` for the song-link
    endpoint is yielded with the item in ``meta`` and ``self.song_parse`` as
    callback.  Afterwards a request for the next result page is yielded when
    ``page + 1 < self.allPages``.

    Reads ``response.meta["tag"]`` and ``response.meta["page"]`` and the
    spider attributes ``tagDict``, ``baseUrl`` and ``size``; sets
    ``self.allPages``.
    """
    # The page count is pinned to 1.  The disabled lookup below would read it
    # from the pager; note that the extracted value is a string and has to be
    # converted to int:
    # self.allPages=int(response.xpath('//div[@class="page-inner"]/a//text()').extract()[-2])
    self.allPages = 1
    song_url = "http://music.baidu.com/data/music/songlink"
    song_list = response.xpath('//div[@class="search-song-list song-list song-list-hook"]/ul/li/div[@class="song-item clearfix "]')
    for result in song_list:
        item = MusicItem()
        try:
            link_class = result.xpath('./span[@class="song-title"]/a/@class').extract()[0]
            if link_class == "no-link":
                # Ends the whole callback, including the pagination step below.
                return
            item['song_tag'] = self.tagDict[response.meta["tag"]]
            item['song_id'] = result.xpath('./span[@class="song-title"]/a/@href').extract()[0].replace("/song/", "")
            item['song_title'] = result.xpath('./span[@class="song-title"]//text()').extract()[0]
            item['singer_name'] = result.xpath('./span[@class="singer"]/span[@class="author_list"]/@title').extract()[0]
            try:
                item['album_title'] = result.xpath('./span[@class="album-title"]/a/text()').extract()[0]
            except IndexError:
                # No album link in this row.
                item['album_title'] = None
            yield FormRequest(
                url=song_url,
                formdata={'songIds': item['song_id'], 'type': 'mp3'},
                dont_filter=True,
                meta={'item': item},
                callback=self.song_parse,
            )
        except Exception as e:
            # Best effort: a malformed row is reported and skipped.
            # Single-argument print calls behave the same on Python 2 and 3.
            print(e)
            print('爬取list出现错误')
    tag = response.meta["tag"]
    page = int(response.meta["page"] + 1)
    if page < self.allPages:
        url = self.baseUrl % (tag, page * self.size)
        yield scrapy.Request(
            url,
            callback=self.list_parse,
            dont_filter=True,
            meta={'tag': tag, 'page': page},
        )
def parse_comment(self, response):
    """Build a ``MusicItem`` from a comment API response and yield it.

    ``response.meta`` must provide ``id``, ``music``, ``artist`` and
    ``album``; ``response.text`` must be JSON.  Every entry of the optional
    ``hotComments`` array is reduced to a dict with the keys ``nickname``,
    ``content``, ``likedcount`` and ``avatarurl``.

    Each field declared on the item is filled from the value of the same
    name (``id``, ``music``, ``artist``, ``album``, ``comments``); a field
    without a matching value is reported and left unset.

    Raises:
        KeyError: if a ``meta`` key or a per-comment key is missing.
    """
    song_id = response.meta['id']
    music = response.meta['music']
    artist = response.meta['artist']
    album = response.meta['album']
    result = json.loads(response.text)

    # ``or []`` also covers an explicit null for ``hotComments``.
    comments = [
        {
            'nickname': comment['user']['nickname'],
            'content': comment['content'],
            'likedcount': comment['likedCount'],
            'avatarurl': comment['user']['avatarUrl'],
        }
        for comment in result.get('hotComments') or []
    ]

    # Explicit name -> value table instead of eval()-ing field names against
    # the local scope, which also resolved globals/builtins and hid every
    # error behind a bare except.
    values = {
        'id': song_id,
        'music': music,
        'artist': artist,
        'album': album,
        'comments': comments,
    }
    item = MusicItem()
    for field in item.fields:
        if field in values:
            item[field] = values[field]
        else:
            print('Field is not defined', field)
    yield item
def parse(self, response):
    """Load ``url`` and ``name`` from the ``listMusic`` list and print the item.

    Nothing is returned or yielded; the loaded item is only printed.
    """
    loader = ItemLoader(item=MusicItem(), response=response)
    loader.add_xpath("url", "//ul[@class='listMusic']//div[@class='name']/a/@href")
    loader.add_xpath("name", "//ul[@class='listMusic']//div[@class='name']/a/text()")
    loaded_item = loader.load_item()
    print(loaded_item)
def parse(self, response):
    """Yield a ``MusicItem`` for each ``<li>`` of the ``cont`` list.

    Artist and track name are the concatenated text nodes of the ``<b>`` and
    ``<strong>`` descendants; the track URL is the entry's direct ``a/@href``
    resolved against the site root.
    """
    site_root = 'http://www.viperial.cc'
    for entry in response.xpath('//ul[@class="cont"]/li'):
        item = MusicItem()
        artist_parts = entry.xpath('.//b/text()').extract()
        item['artist'] = ''.join(artist_parts)
        title_parts = entry.xpath('.//strong/text()').extract()
        item['trackName'] = ''.join(title_parts)
        relative_link = ''.join(entry.xpath('a/@href').extract())
        item['trackUrl'] = urlparse.urljoin(site_root, relative_link)
        yield item
def parse(self, response):
    """Return a list of ``MusicItem`` objects, one per ``<li>`` of the chart.

    Fields set on each item: ``rank``, ``name``, ``singer``, ``num``,
    ``days``, ``change`` (``"up"``, ``"down"`` or ``"no"``) and
    ``changedays``.  The first ten entries and the remaining ones take the
    singer/count text from different text nodes and trim it with different
    fixed offsets.
    """
    trend_by_suffix = {"p": "up", "n": "down"}
    selector = HtmlXPathSelector(response)
    items = []
    for position, row in enumerate(selector.select('//ul[@class = "col5"]/li'), 1):
        item = MusicItem()
        item['rank'] = int(row.select('span[@class = "green-num-box"]/text()').extract()[0])

        title = row.select('div/h3/a/text()|div/p/a/text()').extract()[0]
        # Drop anything from the first '(', '[' or '<' onwards.
        for marker in ('(', '[', '<'):
            title = title.split(marker)[0]
        item['name'] = title

        # "singer / count" text: node index and trimming depend on position.
        if position < 11:
            detail = row.select('div/p/text()').extract()[0]
            singer_cut, num_cut = slice(None, -1), slice(1, -3)
        else:
            detail = row.select('div/p/text()').extract()[1]
            singer_cut, num_cut = slice(17, -1), slice(1, -16)
        pieces = detail.split('/')
        item['singer'] = pieces[0][singer_cut]
        item['num'] = int(pieces[1][num_cut])

        item['days'] = int(row.select('span[@class = "days"]/text()').extract()[0][3:-2])

        # The last character of the trend span's class encodes the direction.
        suffix = row.select('span[contains(@class, "trend")]/@class').extract()[0][-1]
        item['change'] = trend_by_suffix.get(suffix, "no")
        item['changedays'] = int(row.select('span[contains(@class,"trend")]/text()').extract()[0][1])
        items.append(item)
    return items
def parse_item(self, response):
    """Return a ``MusicItem`` with the comment count and the forwarded meta.

    ``comments`` is the text of ``span#cnt_comment_count`` converted with
    ``long``; ``rank``, ``name`` and ``singer`` are copied from
    ``response.meta``.
    """
    selector = HtmlXPathSelector(response)
    item = MusicItem()
    count_text = selector.select('//span[@id="cnt_comment_count"]/text()').extract()[0]
    item['comments'] = long(count_text)
    for key in ('rank', 'name', 'singer'):
        item[key] = response.meta[key]
    return item
def parse_song(self, response):
    """Yield one loaded item per hot comment of a sufficiently popular song.

    ``response.text`` is the JSON comment payload and
    ``response.meta['song_id']`` the song id (concatenated into the song page
    URL, so it must be a string).  Songs whose ``total`` is below 1000 or
    that have no ``hotComments`` produce nothing.  Otherwise the song page is
    fetched with ``requests`` using ``self.headers`` and singer, album and
    song name are taken from it by regular expression.

    Raises:
        AttributeError: if one of the three patterns does not match the song
            page (``re.match`` returns ``None``).
    """
    json_dict = json.loads(response.text)
    if json_dict['total'] < 1000 or not json_dict['hotComments']:
        return

    song_id = response.meta['song_id']
    name_url = "https://music.163.com/song?id=" + song_id
    # NOTE(review): this is a blocking HTTP call inside the callback.
    name_text = requests.get(name_url, headers=self.headers).text
    # NOTE(review): the matches are not checked for None; a changed page
    # layout makes the .group(1) calls raise AttributeError.
    singer = re.match('.*?歌手:(.*?)。', name_text, re.S).group(1)
    ablum = re.match('.*?所属专辑:(.*?)。', name_text, re.S).group(1)
    song_name = re.match('.*?<title>(.*?)-', name_text, re.S).group(1)
    print(json_dict['total'])

    for comment in json_dict['hotComments']:
        # A fresh loader per comment; the former extra ``MusicItem()`` that
        # was created and immediately overwritten is gone.
        item_loader = MusicItemLoader(item=MusicItem(), response=response)
        item_loader.add_value("comment_id", comment["commentId"])
        item_loader.add_value("comment_date", comment['time'])
        item_loader.add_value("singer", singer)
        item_loader.add_value("ablum", ablum)
        item_loader.add_value("song_name", song_name)
        item_loader.add_value("liked_count", comment['likedCount'])
        item_loader.add_value("user_avatar_url", comment['user']['avatarUrl'])
        item_loader.add_value("user_nickname", comment['user']['nickname'])
        item_loader.add_value("comment", comment['content'])
        item_loader.add_value("song_id", song_id)
        # NOTE: collecting the quoted replies (``beReplied``) into a
        # ``discuss`` field was disabled in the original code and stays off.
        yield item_loader.load_item()
def parse_singer(self,response):
    """Request the description page of every artist in ``#m-artist-box``.

    For each ``<li>`` the artist name and link are read from the first
    anchor (either ``./p/a[1]`` or ``./a[1]``).  The link's path gets
    ``/desc`` appended before its query string, e.g.
    ``/artist?id=28387245`` becomes
    ``https://music.163.com/artist/desc?id=28387245``.  The request carries
    the partially filled item in ``meta`` and calls ``self.parse_desc``.
    """
    for entry in response.xpath('//ul[@id="m-artist-box"]/li'):
        item = MusicItem()
        item_name = entry.xpath('./p/a[1]/text()|./a[1]/text()').extract_first()
        href = entry.xpath('./p/a[1]/@href|./a[1]/@href').extract_first()
        href_parts = href.split('?')
        path = href_parts[0].lstrip()
        query = href_parts[1]
        desc_url = 'https://music.163.com' + path + '/desc?' + query
        item['name'] = item_name
        item['url'] = desc_url
        yield scrapy.Request(
            url=desc_url,
            callback=self.parse_desc,
            meta={'item': item},
            dont_filter=True,
        )
def parse(self, response):
    """Yield one ``MusicItem`` per ``latest-tracks`` section.

    All artist texts, all track-name texts and all ``href`` values inside
    the section are each concatenated into a single string; the concatenated
    href is resolved against the site root.
    """
    site_root = 'http://www.viperial.info'
    for section in response.xpath('//section[@class="latest-tracks"]'):
        item = MusicItem()
        artists = section.xpath('.//span[@class="artist"]/text()').extract()
        item['artist'] = ''.join(artists)
        titles = section.xpath('.//span[@class="name"]/text()').extract()
        item['trackName'] = ''.join(titles)
        links = section.xpath(
            './/ul[@class="latest-tracks tracks-list clearfix"]//@href'
        ).extract()
        item['trackUrl'] = urlparse.urljoin(site_root, ''.join(links))
        yield item
def parse(self, response):
    """Return a list of ``MusicItem`` objects, one per ``//ol/li`` entry.

    Each item gets ``rank``, ``name``, ``singer`` and ``rise`` from the first
    match of the corresponding relative XPath.

    Raises:
        IndexError: if one of the lookups matches nothing for an entry.
    """
    # (field, relative XPath) pairs, evaluated in this order for every entry.
    field_paths = (
        ('rank', 'span[@class="list_no"]/text()'),
        ('name', 'div[@class = "music_name"]/span/a/@title'),
        ('singer', 'div[@class = "singer_name"]/span/a/text()'),
        ('rise', 'span[@class = "exponent"]/text()'),
        # Disabled: ('num', 'div[@class="count"]/span/text()'), stored as the
        # full extract() list rather than its first element.
    )
    selector = HtmlXPathSelector(response)
    items = []
    for entry in selector.select('//ol/li'):
        item = MusicItem()
        for field, path in field_paths:
            item[field] = entry.select(path).extract()[0]
        items.append(item)
    return items
def parse_item(self, response):
    """Return a ``MusicItem`` describing the song page in ``response``.

    The song id is every ``id=<digits>`` match in the URL, concatenated; the
    singer (``name``) and album (``belong``) are taken from the content of
    the seventh ``<meta>`` element in ``<head>``.  ``down_url`` is the outer
    media URL built from the id.  The URL, the meta content and the two
    matches are printed along the way.
    """
    item = MusicItem()
    print(response.url)
    meta_content = response.xpath('/html/head/meta[7]/@content').extract()
    print(meta_content)

    song_id = "".join(re.compile(r'id=([0-9]{1,20})').findall(response.url))
    description = "".join(meta_content)
    name = re.compile(r'歌手:(.*?)。').findall(description)
    belong = re.compile(r'所属专辑:(.*?)。').findall(description)
    print(name, belong)

    item["song_id"] = song_id
    item["down_url"] = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(song_id)
    item["name"] = "".join(name)
    item["belong"] = "".join(belong)
    return item
def parse_content(self, response):
    """Yield a ``MusicItem`` holding the title and the lyric text.

    The direct text nodes of ``div.lrc_main`` are stripped and joined with
    ``', '`` into ``song``; ``title`` comes from ``response.meta``.  A marker
    line and the finished item are printed.
    """
    print("----------------------------2")
    document = etree.HTML(response.text)
    raw_lines = document.xpath('//div[@class="lrc_main"]/text()')
    lyric = ', '.join([text.strip() for text in raw_lines])

    item = MusicItem()
    item['title'] = response.meta['title']
    item['song'] = lyric
    print(item)
    yield item
def parse_comment(self, response):
    """Yield a ``MusicItem`` with the song's meta data and its hot comments.

    ``id``, ``music``, ``artist`` and ``album`` are read from
    ``response.meta``; ``response.text`` is parsed as JSON.  When the payload
    has a ``hotComments`` key, each entry becomes a dict with ``nickname``,
    ``content``, ``likedcount`` and ``avatarurl``; otherwise ``comments`` is
    an empty list.
    """
    song_id = response.meta['id']
    music = response.meta['music']
    artist = response.meta['artist']
    album = response.meta['album']
    payload = json.loads(response.text)

    if 'hotComments' in payload:
        # The commenter's avatar URL is stored as well, so that it could be
        # shown by a web front end later on.
        comments = [
            {
                'nickname': entry['user']['nickname'],
                'content': entry['content'],
                'likedcount': entry['likedCount'],
                'avatarurl': entry['user']['avatarUrl'],
            }
            for entry in payload['hotComments']
        ]
    else:
        comments = []

    item = MusicItem()
    # The fields are assigned explicitly: filling them by eval()-ing each
    # name in item.fields proved unreliable and has been dropped.
    item['id'] = song_id
    item['artist'] = artist
    item['album'] = album
    item['music'] = music
    item['comments'] = comments
    yield item

# def parse(self, response):
#     for each in response.xpath('//ul[@id="m-pl-container"]/li'):
#         item = MusicItem()
#         item['count'] = each.xpath('./div/div/span[2]/text()').extract()[0]
#         item['namess'] =each.xpath('./p[1]/a/text()').extract()[0]
#         yield item
def parse(self, response):
    """Yield a detail-page ``Request`` for every row of the chart table.

    For each ``//tbody/tr`` the rank, the link, the trimmed title and the
    singer are extracted.  The first three rows keep link and title under
    ``<strong>``, all later rows under ``<b>``.  The request targets
    ``http://music.163.com`` + link, forwards ``rank``, ``name`` and
    ``singer`` in ``meta`` and calls ``self.parse_item``.
    """
    selector = HtmlXPathSelector(response)
    for position, row in enumerate(selector.select('//tbody/tr'), 1):
        item = MusicItem()
        item['rank'] = int(row.select('td/div/span[contains(@class,"num")]/text()').extract()[0])

        if position < 4:
            href_path = 'td/div/div/div/span[@class="txt"]/strong/a/@href'
            title_path = 'td/div/div/div/span/strong/@title'
        else:
            href_path = 'td/div/div/div/span[@class="txt"]/b/a/@href'
            title_path = 'td/div/div/div/span/b/@title'
        url = row.select(href_path).extract()[0]
        title = row.select(title_path).extract()[0]
        # Keep only the text before the first of each bracket-like marker.
        for marker in ('(', '[', '\u3010', '<'):
            title = title.split(marker)[0]
        item['name'] = title

        item['singer'] = row.select('td/div[@class="text"]/span/@title').extract()[0]
        forwarded = {'rank': item['rank'], 'name': item['name'], 'singer': item['singer']}
        yield Request("http://music.163.com" + url, meta=forwarded, callback=self.parse_item)
from selenium.webdriver.support import expected_conditions as ec
from music.items import MusicItem

# Items built from the saved chart page, one per song.
musics = list()

# Only the first line of the saved HTML file is read and parsed.
with open('./music_html', 'r+', encoding='utf-8') as html_file:
    html = html_file.readline()

root = etree.HTML(html)
names = root.xpath("//div[@class='f-cb']//b/@title")
artists = root.xpath("//div[@class='text']/@title")
print(names)
print(artists)

# CSV export is disabled:
# with open('./music.csv', 'a', encoding='utf-8') as f:
for index, song_name in enumerate(names):
    item = MusicItem()
    item['name'] = song_name
    # Looked up by position: raises IndexError if there are fewer artists
    # than names.
    item['singer'] = artists[index]
    musics.append(item)
    # f.write(names[i]+"|"+artists[i])
    # f.write('\n')

option = webdriver.ChromeOptions()
# option.add_argument("--start-maximized")
# option.add_argument('--no-sandbox')  # this argument is required on Ubuntu
# option.add_argument("--incognito")
# option.add_argument("--headless")  # do not pop up a browser window
timeout = 600
# option.add_argument(
#     '--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"')
# option.add_argument('--')
browser = webdriver.Chrome(chrome_options=option)