def name_parse(self, response):
    """Schedule an album-list request for every singer linked on the page.

    For each anchor with class ``nm nm-icn f-thide s-fc0`` the href gets
    ``/album`` inserted in front of its query string, is joined onto
    ``self.domain`` with ``/`` and is requested with ``self.singer_parse``
    as the callback.  ``self.singer_count`` is incremented once per singer
    that is actually scheduled.

    Anchors whose href is missing or has no ``?`` are logged and skipped
    instead of raising, so one malformed link no longer discards the rest
    of the page.

    Args:
        response: Page response.  ``response.meta['language']`` is passed
            through unchanged to every follow-up request.

    Yields:
        scrapy.Request: One GET request per valid singer link.
    """
    import logging

    log = logging.getLogger(__name__)
    anchors = Selector(text=response.body).xpath(
        "//a[@class='nm nm-icn f-thide s-fc0']")
    # Iterate the selectors directly and query them with relative XPaths
    # rather than serialising each anchor and re-parsing it.
    for anchor in anchors:
        singer_name = anchor.xpath("./text()").extract_first()
        singer_href = anchor.xpath("./@href").extract_first()

        # extract_first() yields None when the attribute is absent, and the
        # '/album' insertion below needs a query string to anchor on.
        if not singer_href or '?' not in singer_href:
            log.warning("Skipping singer link without a usable href: %r",
                        singer_href)
            continue

        self.singer_count += 1

        # Split on the last '?' so '/album' lands just before the query.
        path, _, query = singer_href.rpartition('?')
        singer_href = path + '/album' + '?' + query
        sub_url = '/'.join([self.domain, singer_href])

        print(u'正在爬取第[{}]个歌手[{}]的歌曲, url:{} ...'.format(
            self.singer_count, singer_name, singer_href))
        yield scrapy.Request(
            url=sub_url,
            method="GET",
            callback=self.singer_parse,
            meta={"language": response.meta['language']},
        )
def parse(self, response):
    """Find the language categories on the page and follow the matching ones.

    Every anchor with class ``cat-flag`` whose serialised HTML contains
    ``u'华语'`` is processed: the numeric id is read from the text after the
    last ``=`` in its href, a ``MusicLanguage`` record with that id and the
    anchor text as name is created if none exists yet, and a request for the
    category page is scheduled with ``self.spell_parse`` as the callback.

    Anchors whose href is missing or does not end in a numeric id are
    logged and skipped instead of raising.

    Args:
        response: Page response whose body contains the category links.

    Yields:
        scrapy.Request: One GET request per matching category, carrying the
        language id in ``meta['language']``.
    """
    import logging

    log = logging.getLogger(__name__)
    anchors = Selector(text=response.body).xpath("//a[@class='cat-flag']")
    for anchor in anchors:
        # The match is done on the anchor's full HTML, as before.
        if u'华语' not in anchor.extract():
            continue

        sub_type = anchor.xpath("./text()").extract_first()
        type_href = anchor.xpath("./@href").extract_first()
        if not type_href:
            log.warning("Skipping category %r: anchor has no href", sub_type)
            continue

        try:
            # rindex raises ValueError when '=' is absent; int() raises it
            # when the trailing part is not numeric.
            language_id = int(type_href[type_href.rindex('=') + 1:])
        except ValueError:
            log.warning("Skipping category %r: no numeric id in href %r",
                        sub_type, type_href)
            continue

        if not MusicLanguage.objects.filter(id=language_id).exists():
            MusicLanguage.objects.create(id=language_id, name=sub_type)

        sub_url = '/'.join([self.domain, type_href])
        print(u'正在爬取[{}]歌曲, url:{} ...'.format(sub_type, type_href))
        yield scrapy.Request(
            url=sub_url,
            method="GET",
            callback=self.spell_parse,
            meta={"language": language_id},
        )
def music_parse(self, response):
    """Store every song listed in the page's ``m-table`` table.

    For each ``tbody`` row the song id is read from the text after the last
    ``=`` in the song link's href and passed to ``get_music_url``.  When
    that returns a truthy URL, the song's title and duration are read from
    the row, a progress line is printed, and a ``Music`` record is created
    unless one with the same id already exists.

    Rows without a song link, or whose link does not end in a numeric id,
    are logged and skipped instead of raising, so one malformed row no
    longer discards the remaining songs.

    Args:
        response: Page response.  ``response.meta['singer_id']`` and
            ``response.meta['album_id']`` are stored on each created record.
    """
    import logging

    log = logging.getLogger(__name__)
    rows = Selector(text=response.body).xpath(
        "//table[@class='m-table']/tbody/tr")
    # Query each row selector with relative XPaths instead of serialising
    # the <tr> and re-parsing it for every field.
    for row in rows:
        music_href = row.xpath(
            "./td/div[@class='f-cb']/div/div/span/a/@href").extract_first()
        if not music_href:
            log.warning("Skipping table row without a song link")
            continue

        try:
            # rindex raises ValueError when '=' is absent; int() raises it
            # when the trailing part is not numeric.
            music_id = int(music_href[music_href.rindex('=') + 1:])
        except ValueError:
            log.warning("Skipping song link without a numeric id: %r",
                        music_href)
            continue

        music_url = get_music_url(music_id)
        if not music_url:
            continue

        music_name = row.xpath(
            "./td/div[@class='f-cb']/div/div/span/a/b/@title").extract_first()
        music_duration = row.xpath(
            "./td[@class='s-fc3']/span[@class='u-dur']/text()").extract_first()
        print(u'正在爬取歌曲<<{}>>, url:{} ...'.format(music_name, music_url))

        if not Music.objects.filter(id=music_id).exists():
            Music.objects.create(
                id=music_id,
                name=music_name,
                url=music_url,
                duration=music_duration,
                singer_id=response.meta['singer_id'],
                album_id=response.meta['album_id'],
            )