コード例 #1
0
 def name_parse(self, response):
     """Queue an album-list request for every singer link on the page.

     Every ``<a>`` whose class is exactly ``nm nm-icn f-thide s-fc0`` is
     treated as one singer.  ``/album`` is inserted in front of the query
     string of its href, and the result is joined to ``self.domain`` with
     a ``/`` to form the request URL.

     Args:
         response: The downloaded page.  ``response.meta['language']`` is
             forwarded unchanged to the next callback.

     Yields:
         scrapy.Request: A GET request handled by ``self.singer_parse``.
     """
     # Parse the document once and walk the matched nodes directly rather
     # than serialising each anchor and re-parsing it as a new document.
     anchors = Selector(text=response.body).xpath(
         "//a[@class='nm nm-icn f-thide s-fc0']")
     for anchor in anchors:
         singer_name = anchor.xpath("text()").extract_first()
         singer_href = anchor.xpath("@href").extract_first()
         if not singer_href or '?' not in singer_href:
             # Without an href, or without a query string to splice
             # '/album' in front of, there is nothing to request.  Skip
             # this anchor instead of raising and losing the rest.
             continue
         # Only singers that are actually queued are counted.
         self.singer_count += 1
         path, _, query = singer_href.rpartition('?')
         singer_href = path + '/album?' + query
         sub_url = '/'.join([self.domain, singer_href])
         # Progress message: "crawling songs of singer #N [name], url".
         # Parenthesised so it is valid on both Python 2 and Python 3.
         print(u'正在爬取第[{}]个歌手[{}]的歌曲, url:{} ...'.format(
             self.singer_count, singer_name, singer_href))
         yield scrapy.Request(url=sub_url,
                              method="GET",
                              callback=self.singer_parse,
                              meta={"language": response.meta['language']})
コード例 #2
0
 def parse(self, response):
     """Queue a request for every category link that mentions u'华语'.

     Every ``<a class="cat-flag">`` whose serialised HTML contains the
     text u'华语' is handled.  The integer after the last ``=`` in its
     href is used as the language id: a ``MusicLanguage`` row with that
     id is created when none exists yet, and the id is passed on through
     the request meta.

     Args:
         response: The downloaded page.

     Yields:
         scrapy.Request: A GET request handled by ``self.spell_parse``
         with ``meta={"language": <language id>}``.
     """
     # Parse the document once and walk the matched nodes directly rather
     # than serialising each anchor and re-parsing it as a new document.
     categories = Selector(text=response.body).xpath("//a[@class='cat-flag']")
     for category in categories:
         # The filter is applied to the whole serialised element (tag,
         # attributes and text), exactly as before.
         if u'华语' not in category.extract():
             continue
         sub_type = category.xpath("text()").extract_first()
         type_href = category.xpath("@href").extract_first()
         if not type_href:
             # Anchor without an href: nothing to follow.
             continue
         _, separator, raw_id = type_href.rpartition('=')
         if not separator:
             # No '=' in the href, so there is no id to read.
             continue
         try:
             language_id = int(raw_id)
         except ValueError:
             # Non-numeric id: skip this link, keep crawling the others.
             continue
         if not MusicLanguage.objects.filter(id=language_id).exists():
             MusicLanguage.objects.create(id=language_id, name=sub_type)
         sub_url = '/'.join([self.domain, type_href])
         # Progress message: "crawling [category] songs, url".
         # Parenthesised so it is valid on both Python 2 and Python 3.
         print(u'正在爬取[{}]歌曲, url:{} ...'.format(sub_type, type_href))
         yield scrapy.Request(url=sub_url,
                              method="GET",
                              callback=self.spell_parse,
                              meta={"language": language_id})
コード例 #3
0
 def music_parse(self, response):
     """Store every song listed in the page's ``m-table`` table.

     For each table row the song id is the integer after the last ``=``
     in the song link's href.  ``get_music_url`` is called with that id;
     rows for which it returns a falsy value are skipped.  A ``Music``
     row is created only when no row with that id exists yet.

     Args:
         response: The downloaded page.  ``response.meta['singer_id']``
             and ``response.meta['album_id']`` are stored on each newly
             created ``Music`` row.
     """
     # Parse the document once and query each row node directly rather
     # than serialising each <tr> and re-parsing it as a new document.
     rows = Selector(text=response.body).xpath(
         "//table[@class='m-table']/tbody/tr")
     for row in rows:
         music_href = row.xpath(
             "td/div[@class='f-cb']/div/div/span/a/@href").extract_first()
         if not music_href:
             # Row without a song link (nothing to identify it by).
             continue
         _, separator, raw_id = music_href.rpartition('=')
         if not separator:
             # No '=' in the href, so there is no id to read.
             continue
         try:
             music_id = int(raw_id)
         except ValueError:
             # Non-numeric id: skip this row, keep processing the others.
             continue
         music_url = get_music_url(music_id)
         if not music_url:
             continue
         music_name = row.xpath(
             "td/div[@class='f-cb']/div/div/span/a/b/@title").extract_first()
         music_duration = row.xpath(
             "td[@class='s-fc3']/span[@class='u-dur']/text()").extract_first()
         # Progress message: "crawling song <<name>>, url".
         # Parenthesised so it is valid on both Python 2 and Python 3.
         print(u'正在爬取歌曲<<{}>>, url:{} ...'.format(music_name, music_url))
         if not Music.objects.filter(id=music_id).exists():
             Music.objects.create(id=music_id,
                                  name=music_name,
                                  url=music_url,
                                  duration=music_duration,
                                  singer_id=response.meta['singer_id'],
                                  album_id=response.meta['album_id'])