Code example #1
def crawl_lyrics(singer):
    """逐页抓取歌词"""
    html = crawl_html(BASE_URL, params={'key': singer}, timeout=10)
    page_num = int(html.select('a.page-navigator-number')[-1].text)
    print('Singer %s: %d pages in total' % (singer, page_num))

    SINGER_DIR = os.path.join(BASE_DIR, singer)
    if not os.path.exists(SINGER_DIR):
        os.makedirs(SINGER_DIR)

    cnt = 1
    for i in range(page_num):
        params = {'key': singer, 'start': i * 20}
        content_list = crawl_html(BASE_URL, params=params,
                                  timeout=10).select('li.bb')
        for content in content_list:
            try:
                title = content.find(
                    'span', class_='song-title').find('a').text.strip()
                author = content.find('span',
                                      class_='author_list')['title'].strip()
                if singer not in author:
                    continue
                lyric = content.find(
                    'div', class_='lrc-content').find('p').text.strip()
                # Zero-padded counter keeps the files sorted in crawl order.
                filename = title + ' - ' + author + ' - ' + '{0:0>4}'.format(
                    cnt) + '.txt'
                filename = filename.replace('/', '-')  # '/' would be treated as a path separator
                with openf(os.path.join(SINGER_DIR, filename), 'w') as f:
                    f.write(lyric)  # use `with` so the file is closed promptly
                print(filename)
                cnt += 1
            except Exception:
                # Skip entries that are missing any of the expected fields.
                pass
        print('Finished page %d' % (i + 1))
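
All of these snippets call two module-level helpers that are not shown here: crawl_html (fetch a URL and return either parsed HTML or a JSON dict) and openf (open a text file). The sketch below is only an assumption of what they might look like, inferred from how they are called (requests + BeautifulSoup, UTF-8 file handling); the original implementations may differ.

import requests
from bs4 import BeautifulSoup


def crawl_html(url, params=None, headers=None, timeout=10, return_format='soup'):
    """Fetch a URL; return a BeautifulSoup tree, or a dict when return_format='json'."""
    resp = requests.get(url, params=params, headers=headers, timeout=timeout)
    resp.raise_for_status()
    if return_format == 'json':
        return resp.json()
    return BeautifulSoup(resp.text, 'html.parser')


def openf(path, mode='r'):
    """Open a text file as UTF-8 (lyrics and scripts are often non-ASCII)."""
    return open(path, mode, encoding='utf-8')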
Code example #2
def find_artist_ids():
    """只能拿到前100位的歌手ID"""
    url = 'http://music.163.com/api/artist/top?limit=100&offset=0'
    data = crawl_html(url, return_format='json', headers=headers)
    artists = data['artists']
    with openf(BASE_DIR + 'artists.txt', 'w') as fa:
        for artist in artists:
            artist_name = artist['name'].strip().replace(" ", "_")
            fa.write(artist_name + ' ' + str(artist['id']) + '\n')
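
The "name id" pairs saved by find_artist_ids can drive the per-artist crawler shown in code example #4 below. A minimal driver sketch, assuming the same BASE_DIR and openf helpers and the crawl_lyrics(art_id) function from example #4 (the name crawl_all_artists is illustrative, not part of the original code):

def crawl_all_artists():
    """Hypothetical driver: crawl lyrics for every artist id saved by find_artist_ids."""
    with openf(BASE_DIR + 'artists.txt', 'r') as fa:
        for line in fa:
            name, art_id = line.split()  # names were written with spaces replaced by '_'
            print('Crawling %s (id=%s)' % (name, art_id))
            crawl_lyrics(art_id)         # crawl_lyrics from code example #4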
Code example #3
def crawl_content(link):
    """
    Get the script content from a single topic page, e.g.
    http://transcripts.foreverdreaming.org/viewtopic.php?f=177&t=11508
    """
    # Drop the leading '.' of the relative link before joining with BASE_URL.
    html = crawl_html(BASE_URL + link[1:]).find('div', id='pagecontent')
    page_head = html.find('div', class_='boxheading').find('h2').text
    page_content = html.find('div', class_='postbody')
    p_list = page_content.find_all('p')
    all_ps = '\n'.join([p.text for p in p_list])
    return all_ps, page_head
Code example #4
def crawl_lyrics(art_id):
    """抓取一整个歌手的所有歌词"""
    html = crawl_html(START_URL.format(art_id), headers=headers)  # 先抓该歌手的专辑列表

    artist = html.find('h2', id='artist-name').text.replace(' ', '_').strip()
    artist_dir = BASE_DIR + artist
    if not os.path.exists(artist_dir):  # artist directory
        os.makedirs(artist_dir)
    print("歌手名:", artist)

    albums = html.find('ul', class_='m-cvrlst').find_all('a',
                                                         class_='msk')  # album list
    for album in albums:
        html = crawl_html(BASE_URL + album.get('href'),
                          headers=headers)  # then fetch the song list of this album

        album_title = html.find('h2',
                                class_='f-ff2').text.replace(' ', '_').replace(
                                    '/', '_').strip()  # '/' would break the directory path
        album_dir = os.path.join(artist_dir, album_title)
        if not os.path.exists(album_dir):  # album directory
            os.mkdir(album_dir)
        print("  " + artist + "---" + album_title)

        links = html.find('ul', class_='f-hide').find_all('a')  # song list
        for link in links:
            song_name = link.text.replace(' ', '_').replace('/', '_').strip()
            song_id = link.get('href').split('=')[1]
            try:
                lyric_json = crawl_html(SONG_URL.format(song_id),
                                        return_format='json',
                                        headers=headers)  # fetch the lyric JSON
                lyric_text = lyric_json['lrc']['lyric']
                with openf(os.path.join(album_dir, song_name + '.txt'),
                           'w') as f:
                    f.write(lyric_text)
                print("    " + song_name + ", URL: " +
                      SONG_URL.format(song_id))
            except Exception:  # no lyric available, or the request failed
                print("    " + song_name + ": no lyric, URL: " +
                      SONG_URL.format(song_id))
        print()
Code example #5
def get_script(relative_link):
    """Fetch one movie script from IMSDb, given its relative link."""
    tail = relative_link.split('/')[-1]
    print('Fetching %s' % tail)
    # quote() (from urllib.parse) percent-encodes the spaces in the link.
    script_front_url = BASE_URL + quote(relative_link)

    try:
        front_html = crawl_html(script_front_url)
        script_link = front_html.find('p', align="center").a['href']
    except Exception:
        print('%s has no script :(' % tail)
        return None, None

    if script_link.endswith('.html'):
        title = script_link.split('/')[-1].split(' Script')[0]
        script_html = crawl_html(BASE_URL + script_link)
        script_text = script_html.find('td', {'class': "scrtext"}).get_text()
        script_text = clean_script(script_text)
        return title, script_text
    else:
        print('%s is a pdf :(' % tail)
        return None, None
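
clean_script is called above but not included in these snippets. A minimal hypothetical stand-in is sketched below; the name matches the call site, but the body is only an assumption about what the cleanup might do (normalize the whitespace of the scraped page), not the original implementation.

import re


def clean_script(text):
    """Hypothetical placeholder: normalize whitespace in the scraped script text."""
    text = text.replace('\r\n', '\n')       # unify line endings
    text = re.sub(r'[ \t]+', ' ', text)     # collapse runs of spaces and tabs
    text = re.sub(r'\n{3,}', '\n\n', text)  # collapse long runs of blank lines
    return text.strip()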
Code example #6
def get_all_links():
    """
    Get all links from
    http://transcripts.foreverdreaming.org/viewtopic.php?f=159&t=8506
    """
    all_links = []
    link_url = BASE_URL + '/viewtopic.php?f=159&t=8506'
    html = crawl_html(link_url)
    p_list = html.find('div', class_='postbody').find_all('p')
    for p in p_list:
        if '-' in p.text:  # only paragraphs containing '-' carry transcript links
            all_links.append(p.find('a', class_='postlink')['href'])
    return all_links
Code example #7
def get_all_links(page_num):
    """
    Get all links from
    http://transcripts.foreverdreaming.org/viewforum.php?f=177
    """
    all_links = []
    start_url = BASE_URL + '/viewforum.php?f=177&start='

    for i in range(page_num):
        html = crawl_html(start_url + str(i * 25))  # the forum lists 25 topics per page
        tds = html.find_all('td', class_='topic-titles')
        for td in tds:
            # Keep only the first two query parameters (f and t); drop the rest.
            href = td.find('a')['href'].split('&')[:2]
            all_links.append('&'.join(href))
    return all_links
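
A short driver sketch showing how get_all_links and crawl_content (code example #3) could be chained; the function name crawl_transcripts, the default page count, and the output directory are illustrative assumptions, not part of the original code:

def crawl_transcripts(page_num=10, out_dir='transcripts'):
    """Hypothetical driver: collect topic links, then save every transcript to disk."""
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for link in get_all_links(page_num):
        text, head = crawl_content(link)  # crawl_content from code example #3
        filename = head.strip().replace('/', '-') + '.txt'
        with openf(os.path.join(out_dir, filename), 'w') as f:
            f.write(text)
        print('Saved %s' % filename)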
Code example #8
def crawl_imsdb():
    """Crawl every movie script listed on IMSDb and save each one as a .txt file."""
    start_url = 'http://www.imsdb.com/all%20scripts/'
    paragraphs = crawl_html(start_url).find_all('p')

    if not os.path.exists(SCRIPT_DIR):
        os.makedirs(SCRIPT_DIR)

    for p in paragraphs:
        if p.a is None:  # skip paragraphs that do not link to a script page
            continue
        relative_link = p.a['href']
        title, script = get_script(relative_link)
        if not script:
            continue
        if title.endswith('.html'):
            title = title[:-len('.html')]  # str.strip('.html') strips characters, not the suffix
        cur_filename = os.path.join(SCRIPT_DIR, title + '.txt')
        with openf(cur_filename, 'w') as f:
            f.write(script)