def crawl_lyrics(singer):
    """Crawl a singer's lyrics page by page, one .txt file per song.

    Files are written under BASE_DIR/<singer>/ as
    "<title> - <author> - <NNNN>.txt". Relies on module-level helpers and
    constants: crawl_html, openf, BASE_URL, BASE_DIR.

    Args:
        singer: singer name used both as the search keyword and the directory.
    """
    html = crawl_html(BASE_URL, params={'key': singer}, timeout=10)
    # The last page-navigator link carries the total page count.
    page_num = int(html.select('a.page-navigator-number')[-1].text)
    print('歌手 %s 共 %d 页' % (singer, page_num))

    SINGER_DIR = os.path.join(BASE_DIR, singer)
    if not os.path.exists(SINGER_DIR):
        os.makedirs(SINGER_DIR)

    cnt = 1
    for i in range(page_num):
        params = {'key': singer, 'start': i * 20}  # 20 results per page
        content_list = crawl_html(BASE_URL, params=params, timeout=10).select('li.bb')
        for content in content_list:
            try:
                title = content.find(
                    'span', class_='song-title').find('a').text.strip()
                author = content.find('span', class_='author_list')['title'].strip()
                # Skip results where the singer is only a featured artist.
                if singer not in author:
                    continue
                lyric = content.find(
                    'div', class_='lrc-content').find('p').text.strip()
                filename = title + ' - ' + author + ' - ' + '{0:0>4}'.format(
                    cnt) + '.txt'
                filename = filename.replace('/', '-')  # '/' would break the path
                # Context manager closes the handle promptly; the original
                # openf(...).write(...) leaked one handle per song.
                with openf(os.path.join(SINGER_DIR, filename), 'w') as f:
                    f.write(lyric)
                print(filename)
                cnt += 1
            except (AttributeError, TypeError, KeyError):
                # Entry lacks the expected title/author/lyric markup — skip it.
                # Narrowed from the original bare `except: pass`, which also
                # silently swallowed unrelated bugs (e.g. NameError, IOError).
                pass
        print('完成,第 %d 页' % (i + 1))
def find_artist_ids():
    """Fetch artist IDs from the top-artist API (capped at the first 100).

    Writes one "name id" pair per line to BASE_DIR + 'artists.txt', with
    spaces in names replaced by underscores. Relies on module-level helpers:
    crawl_html, openf, BASE_DIR, headers.
    """
    url = 'http://music.163.com/api/artist/top?limit=100&offset=0'
    response = crawl_html(url, return_format='json', headers=headers)
    with openf(BASE_DIR + 'artists.txt', 'w') as out:
        for entry in response['artists']:
            name = entry['name'].strip().replace(" ", "_")
            out.write(name + ' ' + str(entry['id']) + '\n')
def crawl_lyrics(art_id):
    """Crawl all lyrics for one artist: artist page -> albums -> songs.

    Writes each lyric to BASE_DIR + <artist>/<album>/<song>.txt. Relies on
    module-level helpers and constants: crawl_html, openf, START_URL,
    BASE_URL, SONG_URL, BASE_DIR, headers.

    Args:
        art_id: the site's numeric artist id, interpolated into START_URL.
    """
    # Artist page: yields the display name and the album list.
    html = crawl_html(START_URL.format(art_id), headers=headers)
    artist = html.find('h2', id='artist-name').text.replace(' ', '_').strip()
    artist_dir = BASE_DIR + artist
    if not os.path.exists(artist_dir):  # artist directory
        os.makedirs(artist_dir)
    print("歌手名:", artist)

    albums = html.find('ul', class_='m-cvrlst').find_all('a', class_='msk')  # album links
    for album in albums:
        # Album page: yields the track list.
        html = crawl_html(BASE_URL + album.get('href'), headers=headers)
        album_title = html.find('h2', class_='f-ff2').text.replace(' ', '_').replace(
            '/', '_').strip()  # '/' would break the directory path
        album_dir = os.path.join(artist_dir, album_title)
        if not os.path.exists(album_dir):  # album directory
            os.mkdir(album_dir)
        print("  " + artist + "---" + album_title)

        links = html.find('ul', class_='f-hide').find_all('a')  # song links
        for link in links:
            song_name = link.text.replace(' ', '_').replace('/', '_').strip()
            song_id = link.get('href').split('=')[1]
            try:
                lyric_json = crawl_html(SONG_URL.format(song_id),
                                        return_format='json', headers=headers)
                lyric_text = lyric_json['lrc']['lyric']
                # Context manager closes the handle; the original
                # openf(...).write(...) leaked one handle per song.
                with openf(os.path.join(album_dir, song_name + '.txt'), 'w') as f:
                    f.write(lyric_text)
                print("    " + song_name + ", URL: " + SONG_URL.format(song_id))
            except (KeyError, TypeError):
                # Song has no 'lrc' payload — note it and continue. Narrowed
                # from the original bare `except:`, which also masked I/O and
                # programming errors as "no lyrics".
                print("    " + song_name + ": 无歌词, URL: " + SONG_URL.format(song_id))
        print()
def crawl_himym():
    """Download every How I Met Your Mother transcript into SCRIPT_DIR.

    Relies on module-level helpers: get_all_links, crawl_content, openf,
    SCRIPT_DIR. One .txt file is written per episode page.
    """
    links = get_all_links(9)  # 9 pages for HIMYM TV series
    print('Total links:', len(links))
    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)
    for url in links:
        body, title = crawl_content(url)
        with openf(os.path.join(SCRIPT_DIR, title + '.txt'), 'w') as out:
            out.write(body + '\n')
        print('Finished: ' + title)
def crawl_tbbt():
    """Download every Big Bang Theory transcript into SCRIPT_DIR.

    Relies on module-level helpers: get_all_links, crawl_content, openf,
    SCRIPT_DIR. One .txt file is written per episode page; '/' in a page
    title is replaced so it cannot split the filename into directories.
    """
    links = get_all_links()
    print('Total links:', len(links))
    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)
    for url in links:
        body, title = crawl_content(url)
        title = title.replace('/', '_')
        with openf(os.path.join(SCRIPT_DIR, title + '.txt'), 'w') as out:
            out.write(body + '\n')
        print('Finished: ' + title)
def crawl_imsdb():
    """Download every movie script listed on IMSDb into SCRIPT_DIR.

    Relies on module-level helpers: crawl_html, get_script, openf,
    SCRIPT_DIR. Pages with an empty script are skipped.
    """
    start_url = 'http://www.imsdb.com/all%20scripts/'
    paragraphs = crawl_html(start_url).find_all('p')
    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)
    for p in paragraphs:
        relative_link = p.a['href']
        title, script = get_script(relative_link)
        if not script:
            continue
        # BUG FIX: the original title.strip('.html') removes ANY of the
        # characters '.', 'h', 't', 'm', 'l' from both ends (so
        # 'Hamlet.html' became 'Hamle'); drop the suffix explicitly instead.
        if title.endswith('.html'):
            title = title[:-len('.html')]
        cur_filename = os.path.join(SCRIPT_DIR, title + '.txt')
        with openf(cur_filename, 'w') as f:
            f.write(script)