Esempio n. 1
0
 def parse(self, response):
     """Yield one MusicItem per row of the ``.m-table`` song table.

     The song name is read from the ``title`` attribute of the ``<b>``
     element in the row, and the singer from the ``title`` attribute of a
     ``<span>`` inside the row's fourth ``<td>``.

     Args:
         response: downloaded page exposing ``.css()``.

     Yields:
         MusicItem with the ``name`` and ``singer`` fields set (either may
         be ``None`` when the attribute is absent).
     """
     # "tbody<tr" is not a valid CSS selector; the child combinator selects
     # the rows of the table body.
     rows = response.css('.m-table tbody > tr')
     for row in rows:
         # Index the cells themselves: calling .css() on an already
         # extracted attribute value cannot match anything.
         cells = row.css('td')
         if len(cells) < 4:
             # No fourth cell, hence no singer column: skip the row.
             continue
         item = MusicItem()
         # Fields are assigned by key.  Assumes MusicItem is a scrapy.Item,
         # which rejects attribute-style assignment of fields.
         item['name'] = row.css('td b::attr(title)').extract_first()
         item['singer'] = cells[3].css('span::attr(title)').extract_first()
         yield item
Esempio n. 2
0
 def parse(self, response):
     """Yield a MusicItem for every <li> of the latest-tracks list.

     Each item stores the raw ``extract()`` lists of the artist text, the
     track-name text and every ``href`` attribute found under the <li>.
     """
     # The loop has to run over the <li> nodes, not over their container.
     track_rows = response.xpath(
         '//section[@class="latest-tracks"]'
         '/ul[@class="latest-tracks tracks-list clearfix"]/li'
     )
     for row in track_rows:
         # The leading ".//" keeps each query relative to the current <li>.
         artist = row.xpath('.//span[@class="artist"]/text()').extract()
         track_name = row.xpath('.//span[@class="name"]/text()').extract()
         track_url = row.xpath('.//@href').extract()

         item = MusicItem()
         item['artist'] = artist
         item['trackName'] = track_name
         item['trackUrl'] = track_url
         yield item
Esempio n. 3
0
 def parse_item(self, response):
     """Return an empty MusicItem for ``response``.

     A selector is built over the response, but no field is extracted yet;
     the sample extractions below are left disabled.
     """
     selector = HtmlXPathSelector(response)
     item = MusicItem()
     # item['domain_id'] = selector.select('//input[@id="sid"]/@value').extract()
     # item['name'] = selector.select('//div[@id="name"]').extract()
     # item['description'] = selector.select('//div[@id="description"]').extract()
     return item
Esempio n. 4
0
    def list_parse(self, response):
        """Parse a song-list page and request link data for each song.

        For every song row that has a link, a MusicItem is filled with the
        tag, song id, title, singer and album title, and a FormRequest to
        the song-link endpoint is yielded with the item in ``meta`` and
        ``self.song_parse`` as callback.  After the rows, a request for the
        next list page is yielded while ``page < self.allPages``.

        Args:
            response: list page response; ``response.meta`` must contain
                ``"tag"`` and ``"page"``.

        Yields:
            FormRequest objects for the songs, then at most one
            scrapy.Request for the following list page.
        """
        # The page count is pinned to 1.  The disabled line below reads it
        # from the pager instead; the scraped value is a string and has to
        # be converted to int.
        self.allPages = 1
        # self.allPages = int(response.xpath('//div[@class="page-inner"]/a//text()').extract()[-2])
        song_rows = response.xpath(
            '//div[@class="search-song-list song-list song-list-hook"]'
            '/ul/li/div[@class="song-item clearfix "]')
        for result in song_rows:
            try:
                link_class = result.xpath('./span[@class="song-title"]/a/@class').extract()[0]
                if link_class == "no-link":
                    # Skip only this row.  Returning here would end the
                    # generator and drop the remaining rows as well as the
                    # pagination step below.
                    continue

                item = MusicItem()
                item['song_tag'] = self.tagDict[response.meta["tag"]]
                item['song_id'] = result.xpath('./span[@class="song-title"]/a/@href').extract()[0].replace("/song/", "")
                item['song_title'] = result.xpath('./span[@class="song-title"]//text()').extract()[0]
                item['singer_name'] = result.xpath('./span[@class="singer"]/span[@class="author_list"]/@title').extract()[0]

                # The album is optional: store None when the row has none.
                albums = result.xpath('./span[@class="album-title"]/a/text()').extract()
                item['album_title'] = albums[0] if albums else None

                song_url = "http://music.baidu.com/data/music/songlink"
                yield FormRequest(url=song_url,
                                  formdata={'songIds': item['song_id'], 'type': 'mp3'},
                                  dont_filter=True,
                                  meta={'item': item},
                                  callback=self.song_parse)
            except Exception as e:
                # Best effort: report the broken row and carry on.
                print(e)
                print('爬取list出现错误')

        tag = response.meta["tag"]
        page = int(response.meta["page"] + 1)
        if page < self.allPages:
            url = self.baseUrl % (tag, page * self.size)
            yield scrapy.Request(url, callback=self.list_parse, dont_filter=True, meta={'tag': tag, 'page': page})
Esempio n. 5
0
    def parse_comment(self, response):
        """Build a MusicItem from a hot-comments JSON response.

        Args:
            response: response whose ``text`` is a JSON document and whose
                ``meta`` carries ``id``, ``music``, ``artist`` and
                ``album``.

        Yields:
            A single MusicItem.  Every declared field named ``id``,
            ``music``, ``artist``, ``album`` or ``comments`` is filled in;
            any other declared field is reported and left unset.
        """
        result = json.loads(response.text)
        # A missing or null "hotComments" entry yields an empty list.
        comments = [
            {
                'nickname': comment['user']['nickname'],
                'content': comment['content'],
                'likedcount': comment['likedCount'],
                'avatarurl': comment['user']['avatarUrl'],
            }
            for comment in result.get('hotComments') or []
        ]

        # Explicit lookup table instead of eval(field): evaluating field
        # names as code could pick up any local, global or builtin name.
        values = {
            'id': response.meta['id'],
            'music': response.meta['music'],
            'artist': response.meta['artist'],
            'album': response.meta['album'],
            'comments': comments,
        }

        item = MusicItem()
        for field in item.fields:
            if field in values:
                item[field] = values[field]
            else:
                print('Field is not defined', field)
        yield item
Esempio n. 6
0
 def parse(self, response):
     """Load track URLs and names from the ``listMusic`` list and print them.

     The loaded item is only printed; nothing is returned.
     """
     loader = ItemLoader(item=MusicItem(), response=response)
     field_queries = (
         ("url", "//ul[@class='listMusic']//div[@class='name']/a/@href"),
         ("name", "//ul[@class='listMusic']//div[@class='name']/a/text()"),
     )
     for field_name, query in field_queries:
         loader.add_xpath(field_name, query)
     loaded = loader.load_item()
     print(loaded)
Esempio n. 7
0
    def parse(self, response):
        """Yield a MusicItem for each <li> of the ``cont`` list.

        The artist is the joined text of the ``<b>`` descendants, the track
        name the joined text of the ``<strong>`` descendants, and the track
        URL the ``href`` of the direct ``<a>`` children resolved against
        ``http://www.viperial.cc``.
        """
        # Indentation uses spaces only: the previous mix of spaces and tabs
        # is rejected by Python 3 with a TabError.
        for sel in response.xpath('//ul[@class="cont"]/li'):
            item = MusicItem()
            item['artist'] = ''.join(sel.xpath('.//b/text()').extract())
            item['trackName'] = ''.join(sel.xpath('.//strong/text()').extract())
            item['trackUrl'] = urlparse.urljoin(
                'http://www.viperial.cc', ''.join(sel.xpath('a/@href').extract()))
            yield item
Esempio n. 8
0
    def parse(self, response):
        """Return the MusicItems scraped from the ``col5`` chart list.

        Every ``<li>`` yields an item with ``rank``, ``name``, ``singer``,
        ``num``, ``days``, ``change`` and ``changedays``.  The first ten
        rows and the later rows keep their singer/count text in different
        ``<p>`` text nodes and are sliced with different offsets.
        """
        selector = HtmlXPathSelector(response)
        rows = selector.select('//ul[@class = "col5"]/li')
        # Last character of the trend class name -> direction label.
        trend_by_suffix = {"p": "up", "n": "down"}

        collected = []
        for position, row in enumerate(rows, 1):
            entry = MusicItem()
            entry['rank'] = int(row.select('span[@class = "green-num-box"]/text()').extract()[0])

            # Keep only the part of the title before any "(", "[" or "<".
            title = row.select('div/h3/a/text()|div/p/a/text()').extract()[0]
            entry['name'] = title.split('(')[0].split('[')[0].split('<')[0]

            detail_texts = row.select('div/p/text()').extract()
            # The slice offsets below drop fixed-width text around the
            # values; presumably tied to the page's exact layout.
            if position < 11:
                pieces = detail_texts[0].split('/')
                entry['singer'] = pieces[0][:-1]
                entry['num'] = int(pieces[1][1:-3])
            else:
                pieces = detail_texts[1].split('/')
                entry['singer'] = pieces[0][17:-1]
                entry['num'] = int(pieces[1][1:-16])

            days_text = row.select('span[@class = "days"]/text()').extract()[0]
            entry['days'] = int(days_text[3:-2])

            trend_class = row.select('span[contains(@class, "trend")]/@class').extract()[0]
            entry['change'] = trend_by_suffix.get(trend_class[-1], "no")

            trend_text = row.select('span[contains(@class,"trend")]/text()').extract()[0]
            entry['changedays'] = int(trend_text[1])
            collected.append(entry)

        return collected
Esempio n. 9
0
 def parse_item(self, response):
     """Return a MusicItem with the page's comment count and the meta data.

     ``comments`` is parsed from the ``cnt_comment_count`` span; ``rank``,
     ``name`` and ``singer`` are copied from ``response.meta``.
     """
     selector = HtmlXPathSelector(response)
     item = MusicItem()

     count_text = selector.select('//span[@id="cnt_comment_count"]/text()').extract()[0]
     item['comments'] = long(count_text)
     for key in ('rank', 'name', 'singer'):
         item[key] = response.meta[key]

     return item
Esempio n. 10
0
File: wyy.py Progetto: chaofly/wyy
    def parse_song(self, response):
        """Yield one MusicItem per hot comment of a heavily commented song.

        Songs are processed only when the JSON payload reports a ``total``
        of at least 1000 and contains hot comments.  The song page is then
        fetched to read the singer, album and song title, and every hot
        comment is loaded into its own item.

        Args:
            response: response whose ``text`` is the comments JSON and whose
                ``meta`` carries ``song_id``.

        Yields:
            Items produced by ``MusicItemLoader.load_item()``.
        """
        json_dict = json.loads(response.text)
        hot_comments = json_dict.get('hotComments') or []
        # A payload without "total" or "hotComments" is skipped instead of
        # raising KeyError.
        if (json_dict.get('total') or 0) < 1000 or not hot_comments:
            return

        song_id = response.meta['song_id']
        name_url = "https://music.163.com/song?id=" + song_id
        # NOTE(review): this is a blocking HTTP call without a timeout
        # inside a spider callback; consider a regular Request instead.
        name_text = requests.get(name_url, headers=self.headers).text

        def first_group(pattern):
            """Return group 1 of ``pattern`` in the page, or None if absent."""
            found = re.match(pattern, name_text, re.S)
            return found.group(1) if found else None

        # A page lacking one of these markers no longer crashes the
        # callback; the corresponding value is passed on as None.
        singer = first_group('.*?歌手:(.*?)。')
        ablum = first_group('.*?所属专辑:(.*?)。')
        song_name = first_group('.*?<title>(.*?)-')
        print(json_dict['total'])

        for comment in hot_comments:
            item_loader = MusicItemLoader(item=MusicItem(),
                                          response=response)
            item_loader.add_value("comment_id", comment["commentId"])
            item_loader.add_value("comment_date", comment['time'])
            item_loader.add_value("singer", singer)
            item_loader.add_value("ablum", ablum)
            item_loader.add_value("song_name", song_name)
            item_loader.add_value("liked_count", comment['likedCount'])
            item_loader.add_value("user_avatar_url",
                                  comment['user']['avatarUrl'])
            item_loader.add_value("user_nickname",
                                  comment['user']['nickname'])
            item_loader.add_value("comment", comment['content'])
            item_loader.add_value("song_id", song_id)
            yield item_loader.load_item()
Esempio n. 11
0
 def parse_singer(self, response):
     """Request the description page of every artist in ``m-artist-box``.

     For each ``<li>`` the artist link is rewritten from
     ``/artist?id=N`` to ``https://music.163.com/artist/desc?id=N`` and a
     Request is yielded with a MusicItem (``name``, ``url``) in ``meta``.
     Entries without a link, or whose link has no query string, are
     skipped so that one malformed entry does not abort the others.

     Args:
         response: artist list page exposing ``.xpath()``.

     Yields:
         scrapy.Request objects handled by ``self.parse_desc``.
     """
     for li in response.xpath('//ul[@id="m-artist-box"]/li'):
         # Evaluate the link query once and reuse the result.
         href = li.xpath('./p/a[1]/@href|./a[1]/@href').extract_first()
         if not href:
             continue
         parts = href.split('?')
         if len(parts) < 2:
             continue
         # The path may start with a space (" /artist?id=28387245"), which
         # lstrip() removes before "/desc?" and the query are appended.
         url = 'https://music.163.com' + parts[0].lstrip() + '/desc?' + parts[1]

         item = MusicItem()
         item['name'] = li.xpath('./p/a[1]/text()|./a[1]/text()').extract_first()
         item['url'] = url
         yield scrapy.Request(url=url, callback=self.parse_desc,
                              meta={'item': item}, dont_filter=True)
Esempio n. 12
0
 def parse(self, response):
     """Yield one MusicItem per track of the latest-tracks list.

     Iterating over the ``<section>`` container joined the artist, name
     and link of every track into a single item.  The loop now runs over
     the list entries, so each track gets its own item.

     NOTE(review): assumes every track is an ``<li>`` of the
     ``latest-tracks tracks-list clearfix`` list -- confirm against the
     page markup.

     Yields:
         MusicItem with ``artist``, ``trackName`` and ``trackUrl``; the URL
         is resolved against ``http://www.viperial.info``.
     """
     track_rows = response.xpath(
         '//section[@class="latest-tracks"]'
         '//ul[@class="latest-tracks tracks-list clearfix"]/li')
     for sel in track_rows:
         item = MusicItem()
         item['artist'] = ''.join(
             sel.xpath('.//span[@class="artist"]/text()').extract())
         item['trackName'] = ''.join(
             sel.xpath('.//span[@class="name"]/text()').extract())
         # Use the first link of the entry; joining several hrefs would
         # produce an unusable URL.
         hrefs = sel.xpath('.//@href').extract()
         item['trackUrl'] = urlparse.urljoin(
             'http://www.viperial.info', hrefs[0] if hrefs else '')
         yield item
Esempio n. 13
0
 def parse(self, response):
     """Return a MusicItem for every ``<li>`` found under an ``<ol>``.

     Each item takes the first match of the rank, name, singer and rise
     queries, evaluated relative to the list entry.
     """
     selector = HtmlXPathSelector(response)
     # (field, relative XPath) pairs, applied in this order.
     # A 'num' field (div[@class="count"]/span/text(), whole extract()
     # list) is currently disabled.
     field_queries = (
         ('rank', 'span[@class="list_no"]/text()'),
         ('name', 'div[@class = "music_name"]/span/a/@title'),
         ('singer', 'div[@class = "singer_name"]/span/a/text()'),
         ('rise', 'span[@class = "exponent"]/text()'),
     )

     results = []
     for row in selector.select('//ol/li'):
         entry = MusicItem()
         for field_name, query in field_queries:
             entry[field_name] = row.select(query).extract()[0]
         results.append(entry)
     return results
Esempio n. 14
0
    def parse_item(self, response):
        """Return a MusicItem built from the song URL and a head meta tag.

        The song id is taken from the ``id=`` part of ``response.url``; the
        singer and album are cut out of the content of the seventh
        ``<meta>`` element in ``<head>``.  Progress is printed along the
        way.
        """
        item = MusicItem()
        print(response.url)
        meta_values = response.xpath('/html/head/meta[7]/@content').extract()
        print(meta_values)
        meta_text = "".join(meta_values)

        song_ids = re.findall(r'id=([0-9]{1,20})', response.url)
        singers = re.findall(r'歌手:(.*?)。', meta_text)
        albums = re.findall(r'所属专辑:(.*?)。', meta_text)
        print(singers, albums)

        joined_id = "".join(song_ids)
        item["song_id"] = joined_id
        item["down_url"] = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(joined_id)

        # "name" receives the singer text, "belong" the album text.
        item["name"] = "".join(singers)
        item["belong"] = "".join(albums)

        return item
Esempio n. 15
0
    def parse_content(self, response):
        """Yield a MusicItem holding the lyric text of the page.

        The text nodes of the ``lrc_main`` div are stripped and joined with
        ``", "`` into ``song``; ``title`` is copied from ``response.meta``.
        A marker line and the finished item are printed.
        """
        print("----------------------------2")
        tree = etree.HTML(response.text)
        stripped_lines = [
            text.strip()
            for text in tree.xpath('//div[@class="lrc_main"]/text()')
        ]

        item = MusicItem()
        item['title'] = response.meta['title']
        item['song'] = ', '.join(stripped_lines)
        print(item)
        yield item
Esempio n. 16
0
    def parse_comment(self, response):
        """Yield a MusicItem with a song's metadata and its hot comments.

        ``id``, ``music``, ``artist`` and ``album`` come from
        ``response.meta``; ``comments`` is a list of dicts (nickname,
        content, likedcount, avatarurl) built from the ``hotComments``
        entry of the JSON body, or an empty list when that key is absent.
        """
        meta = response.meta
        song_id = meta['id']
        music = meta['music']
        artist = meta['artist']
        album = meta['album']
        payload = json.loads(response.text)

        comments = []
        if 'hotComments' in payload.keys():
            # The comment author's avatar is stored as well, so that a web
            # front end could display it later.
            comments = [
                {
                    'nickname': entry['user']['nickname'],
                    'content': entry['content'],
                    'likedcount': entry['likedCount'],
                    'avatarurl': entry['user']['avatarUrl'],
                }
                for entry in payload['hotComments']
            ]

        item = MusicItem()
        # Fields are assigned explicitly; an earlier eval()-based loop over
        # item.fields was dropped because eval proved unreliable.
        assignments = (
            ('id', song_id),
            ('artist', artist),
            ('album', album),
            ('music', music),
            ('comments', comments),
        )
        for field_name, value in assignments:
            item[field_name] = value
        yield item

# def parse(self, response):
#     for each in response.xpath('//ul[@id="m-pl-container"]/li'):
#         item = MusicItem()
#         item['count'] = each.xpath('./div/div/span[2]/text()').extract()[0]
#         item['namess'] =each.xpath('./p[1]/a/text()').extract()[0]
#         yield item
Esempio n. 17
0
    def parse(self, response):
        """Yield a song-page Request for every row of the chart table.

        Rank, name and singer are read from the row, stored on a MusicItem
        and passed on in the request ``meta`` to ``self.parse_item``.
        """
        selector = HtmlXPathSelector(response)
        rows = selector.select('//tbody/tr')

        for position, row in enumerate(rows, 1):
            # The first three rows are queried through <strong>, the
            # remaining rows through <b>.
            if position < 4:
                href_query = 'td/div/div/div/span[@class="txt"]/strong/a/@href'
                title_query = 'td/div/div/div/span/strong/@title'
            else:
                href_query = 'td/div/div/div/span[@class="txt"]/b/a/@href'
                title_query = 'td/div/div/div/span/b/@title'

            entry = MusicItem()
            entry['rank'] = int(row.select('td/div/span[contains(@class,"num")]/text()').extract()[0])
            url = row.select(href_query).extract()[0]

            # Keep only the part of the title before the first marker.
            title = row.select(title_query).extract()[0]
            for marker in ('(', '[', '\u3010', '<'):
                title = title.split(marker)[0]
            entry['name'] = title

            entry['singer'] = row.select('td/div[@class="text"]/span/@title').extract()[0]

            yield Request("http://music.163.com" + url,
                          meta={'rank': entry['rank'], 'name': entry['name'], 'singer': entry['singer']},
                          callback=self.parse_item)
Esempio n. 18
0
from selenium.webdriver.support import expected_conditions as ec

from music.items import MusicItem

# Build MusicItem records from a saved HTML dump, then start a Chrome driver.
musics = list()
# NOTE(review): 'r+' opens the file for reading and writing although only a
# read is visible here -- confirm whether plain 'r' would do.
with open('./music_html', 'r+', encoding='utf-8') as f:
    # Only the first line of the file is read and parsed.
    # NOTE(review): presumably the dump is stored on a single line -- confirm.
    html = f.readline()
    root = etree.HTML(html)
    # Song titles and artist names are collected as two parallel lists.
    names = root.xpath("//div[@class='f-cb']//b/@title")
    artists = root.xpath("//div[@class='text']/@title")
    print(names)
    print(artists)
    # with open('./music.csv', 'a', encoding='utf-8') as f:
    # Pair the lists by index; artists must be at least as long as names,
    # otherwise artists[i] raises IndexError.
    for i in range(len(names)):
        item = MusicItem()
        item['name'] = names[i]
        item['singer'] = artists[i]
        musics.append(item)
        # f.write(names[i]+"|"+artists[i])
        # f.write('\n')
# Chrome options; every add_argument call below is currently disabled.
option = webdriver.ChromeOptions()
# option.add_argument("--start-maximized")
# option.add_argument('--no-sandbox')  # Ubuntu needs this argument
# option.add_argument("--incognito")
# option.add_argument("--headless")  # do not pop up a browser window
# Not used within the visible lines; presumably a wait limit in seconds
# consumed further down -- TODO confirm.
timeout = 600
# option.add_argument(
#     '--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"')
# option.add_argument('--')
# NOTE(review): newer Selenium releases expect `options=` instead of
# `chrome_options=` -- confirm against the installed version.
browser = webdriver.Chrome(chrome_options=option)