def crawl_lyrics(singer):
    """Crawl lyrics for one singer, page by page."""
    html = crawl_html(BASE_URL, params={'key': singer}, timeout=10)
    page_num = int(html.select('a.page-navigator-number')[-1].text)
    print('Singer %s: %d pages in total' % (singer, page_num))

    SINGER_DIR = os.path.join(BASE_DIR, singer)
    if not os.path.exists(SINGER_DIR):
        os.makedirs(SINGER_DIR)

    cnt = 1
    for i in range(page_num):
        params = {'key': singer, 'start': i * 20}  # 20 results per page
        content_list = crawl_html(BASE_URL, params=params, timeout=10).select('li.bb')
        for content in content_list:
            try:
                title = content.find('span', class_='song-title').find('a').text.strip()
                author = content.find('span', class_='author_list')['title'].strip()
                if singer not in author:  # skip songs not credited to this singer
                    continue
                lyric = content.find('div', class_='lrc-content').find('p').text.strip()
                filename = title + ' - ' + author + ' - ' + '{0:0>4}'.format(cnt) + '.txt'
                filename = filename.replace('/', '-')  # '/' is not allowed in filenames
                with openf(os.path.join(SINGER_DIR, filename), 'w') as f:
                    f.write(lyric)
                print(filename)
                cnt += 1
            except Exception:  # skip entries missing any of the fields above
                pass
        print('Finished page %d' % (i + 1))
def find_artist_ids():
    """Fetch artist IDs; this API only exposes the top 100 artists."""
    url = 'http://music.163.com/api/artist/top?limit=100&offset=0'
    html = crawl_html(url, return_format='json', headers=headers)
    artists = html['artists']
    with openf(BASE_DIR + 'artists.txt', 'w') as fa:
        for artist in artists:
            artist_name = artist['name'].strip().replace(" ", "_")
            fa.write(artist_name + ' ' + str(artist['id']) + '\n')
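# The functions in this file rely on two shared helpers, crawl_html and openf, which
# are not part of this excerpt. Below is a minimal sketch of what they might look like,
# assuming requests + BeautifulSoup, with return_format='json' switching the return
# type; the real implementations may differ.
import requests
from bs4 import BeautifulSoup


def crawl_html(url, params=None, headers=None, timeout=10, return_format='soup'):
    """Fetch a URL and return either a BeautifulSoup tree or parsed JSON."""
    resp = requests.get(url, params=params, headers=headers, timeout=timeout)
    resp.raise_for_status()
    if return_format == 'json':
        return resp.json()
    return BeautifulSoup(resp.text, 'html.parser')


def openf(path, mode='r'):
    """open() pinned to UTF-8 so lyrics and scripts round-trip on any platform."""
    return open(path, mode, encoding='utf-8')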
def crawl_content(link):
    """
    Get the script content from a single page, e.g.
    http://transcripts.foreverdreaming.org/viewtopic.php?f=177&t=11508
    """
    html = crawl_html(BASE_URL + link[1:]).find('div', id='pagecontent')  # link starts with './', drop the dot
    page_head = html.find('div', class_='boxheading').find('h2').text
    page_content = html.find('div', class_='postbody')
    p_list = page_content.find_all('p')
    all_ps = '\n'.join([p.text for p in p_list])
    return all_ps, page_head
def crawl_lyrics(art_id):
    """Crawl all lyrics of a single artist."""
    html = crawl_html(START_URL.format(art_id), headers=headers)  # artist page, lists the albums first

    artist = html.find('h2', id='artist-name').text.replace(' ', '_').strip()
    artist_dir = BASE_DIR + artist
    if not os.path.exists(artist_dir):  # one directory per artist
        os.makedirs(artist_dir)
    print("Artist:", artist)

    albums = html.find('ul', class_='m-cvrlst').find_all('a', class_='msk')  # album list
    for album in albums:
        html = crawl_html(BASE_URL + album.get('href'), headers=headers)  # then the song list of this album

        album_title = html.find('h2', class_='f-ff2').text.replace(' ', '_').replace('/', '_').strip()  # '/' would break the path
        album_dir = os.path.join(artist_dir, album_title)
        if not os.path.exists(album_dir):  # one directory per album
            os.mkdir(album_dir)
        print(" " + artist + "---" + album_title)

        links = html.find('ul', class_='f-hide').find_all('a')  # song list
        for link in links:
            song_name = link.text.replace(' ', '_').replace('/', '_').strip()
            song_id = link.get('href').split('=')[1]
            try:
                lyric_json = crawl_html(SONG_URL.format(song_id), return_format='json', headers=headers)  # fetch the lyric JSON
                lyric_text = lyric_json['lrc']['lyric']
                with openf(os.path.join(album_dir, song_name + '.txt'), 'w') as f:
                    f.write(lyric_text)
                print(" " + song_name + ", URL: " + SONG_URL.format(song_id))
            except Exception:
                print(" " + song_name + ": no lyrics, URL: " + SONG_URL.format(song_id))
        print()
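# find_artist_ids and crawl_lyrics(art_id) above reference several module-level names
# that are not included in this excerpt (BASE_DIR, BASE_URL, START_URL, SONG_URL,
# headers). The values below are a sketch based on the public music.163.com endpoints
# the code appears to target; treat them as assumptions, not the original configuration.
BASE_DIR = './netease_lyrics/'
BASE_URL = 'http://music.163.com'
START_URL = 'http://music.163.com/artist?id={}'  # artist page listing the albums
SONG_URL = 'http://music.163.com/api/song/lyric?id={}&lv=1&kv=1&tv=-1'  # lyric JSON API (assumed)
headers = {
    'Referer': 'http://music.163.com/',
    'User-Agent': 'Mozilla/5.0',  # placeholder UA string
}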
def get_script(relative_link):
    """Fetch one script from IMSDb; returns (title, text) or (None, None)."""
    tail = relative_link.split('/')[-1]
    print('Fetching %s' % tail)
    script_front_url = BASE_URL + quote(relative_link)
    try:
        front_html = crawl_html(script_front_url)
        script_link = front_html.find('p', align="center").a['href']
    except Exception:
        print('%s has no script :(' % tail)
        return None, None

    if script_link.endswith('.html'):
        title = script_link.split('/')[-1].split(' Script')[0]
        script_html = crawl_html(BASE_URL + script_link)
        script_text = script_html.find('td', {'class': "scrtext"}).get_text()
        script_text = clean_script(script_text)
        return title, script_text
    else:
        print('%s is a pdf :(' % tail)
        return None, None
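# get_script() above calls clean_script(), which is not included in this excerpt.
# A minimal sketch of what it plausibly does (trim whitespace and drop blank lines);
# the original may perform additional cleanup.
def clean_script(text):
    lines = [line.strip() for line in text.splitlines()]
    return '\n'.join(line for line in lines if line)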
def get_all_links():
    """
    Get all links from http://transcripts.foreverdreaming.org/viewtopic.php?f=159&t=8506
    """
    all_links = []
    link_url = BASE_URL + '/viewtopic.php?f=159&t=8506'
    html = crawl_html(link_url)
    p_list = html.find('div', class_='postbody').find_all('p')
    for p in p_list:
        if '-' in p.text:  # only entries whose text contains '-' link to a transcript
            all_links.append(p.find('a', class_='postlink')['href'])
    return all_links
def get_all_links(page_num):
    """
    Get all links from http://transcripts.foreverdreaming.org/viewforum.php?f=177
    """
    all_links = []
    start_url = BASE_URL + '/viewforum.php?f=177&start='
    for i in range(page_num):
        html = crawl_html(start_url + str(i * 25))  # 25 topics per forum page
        tds = html.find_all('td', class_='topic-titles')
        for td in tds:
            href = td.find('a')['href'].split('&')[:2]  # drop trailing parameters, keep only f= and t=
            all_links.append('&'.join(href))
    return all_links
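# A possible driver tying get_all_links(page_num) and crawl_content() together: fetch
# every topic link, extract the transcript, and write one file per episode. The function
# name, out_dir default, and page count are hypothetical, shown only as a usage sketch.
def crawl_transcripts(page_num=10, out_dir='./transcripts/'):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for link in get_all_links(page_num):
        text, head = crawl_content(link)
        filename = head.strip().replace('/', '-') + '.txt'
        with openf(os.path.join(out_dir, filename), 'w') as f:
            f.write(text)
        print('Saved', filename)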
def crawl_imsdb():
    """Crawl every movie script listed on IMSDb's index page."""
    start_url = 'http://www.imsdb.com/all%20scripts/'
    paragraphs = crawl_html(start_url).find_all('p')

    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)

    for p in paragraphs:
        relative_link = p.a['href']
        title, script = get_script(relative_link)
        if not script:
            continue
        # remove the '.html' suffix explicitly (str.strip('.html') would also eat
        # matching characters of the title itself)
        if title.endswith('.html'):
            title = title[:-len('.html')]
        cur_filename = os.path.join(SCRIPT_DIR, title + '.txt')
        with openf(cur_filename, 'w') as f:
            f.write(script)
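# get_script() and crawl_imsdb() above also assume module-level BASE_URL and SCRIPT_DIR
# plus `from urllib.parse import quote` at the top of the file. Hypothetical values and
# a simple entry point for a quick run:
BASE_URL = 'http://www.imsdb.com'
SCRIPT_DIR = './imsdb_scripts/'

if __name__ == '__main__':
    crawl_imsdb()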