def getSongs(artist_tuple, album_tuple):
    """Return [(song_name, song_url), ...] for one album on a LyricWiki artist page.

    artist_tuple: (artist_name, artist_url_path)
    album_tuple:  (album_name_or_None, album_url_or_None)
    Returns [] when no song list can be located.
    """
    soup = BeautifulSoup(getHtml(WIKIA_DOMAIN + artist_tuple[1]))
    albums = soup.select('span.mw-headline')
    # FIX: initialize so a failed search in either loop below cannot raise
    # NameError at the while-condition; None falls through to the AttributeError
    # handler and yields an empty result.
    songs = None
    if not album_tuple[0]:
        # Album has no name: the "Other Songs" category is the headline with no link.
        for album in albums:
            if not album('a'):
                songs = album.parent
    elif not album_tuple[1]:
        # Album has a name but no page of its own: match the headline text.
        for album in albums:
            if album('a')[0].string == album_tuple[0]:
                songs = album.parent
    else:
        songs = soup.select('a[href=%s]' % album_tuple[1])[0].parent.parent
    # Walk forward from the headline until the song list (<ol>/<ul>) is found.
    # Reaching another <h2> (next section) or the end of the document means none exists.
    while (type(songs) is not Tag) or (songs.name != 'ol' and songs.name != 'ul'):
        try:
            songs = songs.nextSibling
            if type(songs) is Tag and songs.name == 'h2':
                raise AttributeError
        except AttributeError:
            songs = None
            break
    ret = []
    if songs:
        for song in songs('a'):
            # "action=edit" hrefs are red links: song pages that do not exist yet.
            if song['href'].find('action=edit') > -1:
                continue
            ret.append((song.string, song['href']))
    return ret
def getArtists():
    """Scrape the uta-net name index and return [(artist_name, artist_url), ...]."""
    index = BeautifulSoup(
        getHtml('http://www.uta-net.com/user/search_index/name.html'))
    base = 'http://www.uta-net.com/user/search_index/'
    pairs = []
    for link in index.select('.album td a'):
        pairs.append((link.string, base + link['href']))
    return pairs
def getSongs(artist_tuple, album_tuple):
    """Return [(song_name, song_url), ...] for one DarkLyrics album, or None if absent."""
    page = BeautifulSoup(getHtml(DARKLYRICS_URL + album_tuple[1]))
    for block in page.select('.album'):
        # Header text carries surrounding quote characters; strip first/last char.
        if block.h2.strong.string[1:-1] != album_tuple[0]:
            continue
        # hrefs carry a fixed site-specific prefix/suffix; slice them off.
        return [(anchor.string, anchor['href'][3:-2])
                for anchor in block.select('a')]
    return None
def getSongs(artist_tuple, album_tuple):
    """Return [(song_name, song_url), ...] for one album on a LyricWiki artist page.

    artist_tuple: (artist_name, artist_url_path)
    album_tuple:  (album_name_or_None, album_url_or_None)
    Returns [] when no song list can be located.
    """
    soup = BeautifulSoup(getHtml(WIKIA_DOMAIN + artist_tuple[1]))
    albums = soup.select('span.mw-headline')
    # FIX: initialize so a failed match in either loop below cannot leave `songs`
    # unbound and raise NameError at the while-condition.
    songs = None
    if not album_tuple[0]:
        # If album has no name and is in a "Other Songs" category (unlinked headline).
        for album in albums:
            if not album('a'):
                songs = album.parent
    elif not album_tuple[1]:
        # If album has a name, but no link to go to: match headline text instead.
        for album in albums:
            if album('a')[0].string == album_tuple[0]:
                songs = album.parent
    else:
        songs = soup.select('a[href=%s]' % album_tuple[1])[0].parent.parent
    # Advance through siblings until the <ol>/<ul> of songs; an <h2> or the end
    # of the document (AttributeError on None) means no list exists.
    while (type(songs) is not Tag) or (songs.name != 'ol' and songs.name != 'ul'):
        try:
            songs = songs.nextSibling
            if type(songs) is Tag and songs.name == 'h2':
                raise AttributeError
        except AttributeError:
            songs = None
            break
    ret = []
    if songs:
        for song in songs('a'):
            # Skip red links ("action=edit"): target pages do not exist yet.
            if song['href'].find('action=edit') > -1:
                continue
            ret.append((song.string, song['href']))
    return ret
def getSongs(artist_tuple, album_tuple):
    """Look up album_tuple[0] on its DarkLyrics page and return its (name, url) songs.

    Returns None when no album block matches.
    """
    soup = BeautifulSoup(getHtml(DARKLYRICS_URL + album_tuple[1]))
    matches = [blk for blk in soup.select('.album')
               # strip the quote characters wrapping the header text
               if blk.h2.strong.string[1:-1] == album_tuple[0]]
    if not matches:
        return None
    # trim the fixed site-specific prefix/suffix from each relative href
    return [(a.string, a['href'][3:-2]) for a in matches[0].select('a')]
def getLyrics(artist_tuple, album_tuple, song_tuple):
    """Return cleaned lyric nodes for one DarkLyrics song, or None if not found."""
    soup = BeautifulSoup(getHtml(DARKLYRICS_URL + song_tuple[1]))
    for song in soup.select('.lyrics h3'):
        # Headers read "N. Title"; drop the leading track number.
        if song.string[song.string.index(' ') + 1:] != song_tuple[0]:
            continue
        lyrics = []
        # Collect siblings until the next song header / divider / link.
        # FIX: also stop when the container ends (next_sibling is None) instead
        # of raising AttributeError on None.name.
        while song.next_sibling is not None and \
                song.next_sibling.name not in ['h3', 'div', 'a']:
            song = song.next_sibling
            lyrics += [song]
        return cleanLyricList(lyrics)
    return None
def getAlbums(artist_tuple):
    """Return [(album_title, album_url), ...] from an artist page's sidebar links."""
    soup = BeautifulSoup(getHtml(artist_tuple[1]))
    ret = []
    for album in soup.select('.sidebar_right_2c a'):
        try:
            ret.append((album.img['title'], album['href']))
        except (TypeError, KeyError):
            # FIX: narrowed from a bare `except Exception`. A link with no <img>
            # makes album.img None (TypeError on subscript); a tag missing the
            # expected attribute raises KeyError. Either just means the link is
            # not an album, so it is skipped deliberately.
            pass
    return ret
def getLyrics(artist_tuple, album_tuple, song_tuple):
    """Return cleaned lyric nodes for one DarkLyrics song, or None if not found."""
    soup = BeautifulSoup(getHtml(DARKLYRICS_URL + song_tuple[1]))
    songs = soup.select('.lyrics h3')
    for song in songs:
        # "N. Title" header: strip everything up to the first space.
        if song.string[song.string.index(' ') + 1:] == song_tuple[0]:
            lyrics = []
            # Gather siblings up to the next header/divider/link.
            # FIX: guard against the header being the last child — next_sibling
            # is then None and None.name would raise AttributeError.
            while song.next_sibling is not None and \
                    song.next_sibling.name not in ['h3', 'div', 'a']:
                song = song.next_sibling
                lyrics += [song]
            return cleanLyricList(lyrics)
    return None
def getSongs(artist_tuple, album_tuple):
    """Return [(song_name, lyric_url), ...] for one uta-net album, or [] if absent."""
    markup = getHtml(album_tuple[1]).decode('shift_jisx0213')
    soup = BeautifulSoup(markup, "html.parser")
    for header in soup.select('td.font_base_size_L strong'):
        if header.string != album_tuple[0]:
            continue
        table = header.find_parent("table").find_parent("table")
        links = table.select('.font_base_size a')
        # Remove any nested tags so link.string yields the plain song title.
        for link in links:
            for nested in link.select('*'):
                nested.extract()
        prefix = 'http://sp.uta-net.com/search/kashi.php?TID='
        return [(link.string,
                 prefix + link['href'][link['href'].index('ID=') + 3:])
                for link in links]
    return []
def getLyrics(artist_tuple, album_tuple, song_tuple):
    """Return unescaped lyric text scraped from a LyricWiki song page."""
    soup = BeautifulSoup(getHtml(WIKIA_DOMAIN + song_tuple[1]))
    lyricsdiv = soup.select('div.lyricbox')[0]
    if lyricsdiv.select('a[href=/Category:Instrumental]'):
        # FIX: the original assigned "(Instrumental)" to a local that was never
        # used and fell through to return the raw box contents; return the
        # marker directly for instrumental tracks.
        return "(Instrumental)"
    # Remove divs and HTML comments ("crap") from the wiki lyrics box.
    for div in lyricsdiv('div'):
        div.extract()
    comments = lyricsdiv.find_all(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    return unescape(cleanLyricList(lyricsdiv.contents))
def getArtists():
    """Crawl every DarkLyrics index page (a-z plus '19') and return (name, url) pairs."""
    ret = []
    # Artist pages are indexed by first letter; '19' covers numeric names.
    pages = [chr(code) for code in range(ord('a'), ord('z') + 1)] + ['19']
    for page in pages:
        # Progress indicator: emit the current letter without a newline.
        sys.stdout.write(page)
        sys.stdout.flush()
        soup = BeautifulSoup(getHtml(DARKLYRICS_URL + page + ".html"))
        # Artists are split across a left (.fl) and right (.fr) column.
        for column in ('.artists.fl', '.artists.fr'):
            selector = column + ' a[href^=' + page + '/]'
            ret += [(a.string.title(), a['href']) for a in soup.select(selector)]
    return ret
def getArtists():
    """Walk the DarkLyrics letter indexes and return [(artist_name, url), ...]."""
    collected = []
    # One index page per starting letter, plus '19' for numeric artist names.
    for letter in [chr(i) for i in range(97, 123)] + ['19']:
        # Show progress on stdout as each letter page is fetched.
        sys.stdout.write(letter)
        sys.stdout.flush()
        page = BeautifulSoup(getHtml(DARKLYRICS_URL + letter + ".html"))
        left = page.select('.artists.fl a[href^=' + letter + '/]')
        collected += [(a.string.title(), a['href']) for a in left]
        right = page.select('.artists.fr a[href^=' + letter + '/]')
        collected += [(a.string.title(), a['href']) for a in right]
    return collected
def getLyrics(artist_tuple, album_tuple, song_tuple):
    """Return unescaped lyric text scraped from a LyricWiki song page."""
    soup = BeautifulSoup(getHtml(WIKIA_DOMAIN + song_tuple[1]))
    lyricsdiv = soup.select('div.lyricbox')[0]
    if lyricsdiv.select('a[href=/Category:Instrumental]'):
        # FIX: "(Instrumental)" was assigned to an unused local and execution
        # fell through to return the raw lyric box; return the marker instead.
        return "(Instrumental)"
    # Remove divs and crap from wiki lyrics box.
    for div in lyricsdiv('div'):
        div.extract()
    comments = lyricsdiv.find_all(
        text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    return unescape(cleanLyricList(lyricsdiv.contents))
def getAlbums(artist_tuple):
    """Return [(album_name_or_None, album_url_or_None), ...] for a LyricWiki artist."""
    soup = BeautifulSoup(getHtml(WIKIA_DOMAIN + artist_tuple[1]))
    ret = []
    for headline in soup.select('span.mw-headline'):
        links = headline('a')
        if links:
            album_name = links[0].string
            album_url = links[0]['href']
            # Red links ("action=edit") point at pages that do not exist yet.
            if album_url.find('action=edit') != -1:
                album_url = None
        else:
            # Unlinked headline (e.g. the "Other Songs" section).
            album_name = None
            album_url = None
        ret.append((album_name, album_url))
    return ret
def getAlbums(artist_tuple):
    """Collect (name, url) pairs from the section headlines of a LyricWiki artist page.

    Either element may be None: the name for unlinked sections, the url for
    red links ("action=edit") whose target page does not exist.
    """
    page = BeautifulSoup(getHtml(WIKIA_DOMAIN + artist_tuple[1]))
    results = []
    for span in page.select('span.mw-headline'):
        anchors = span('a')
        name = anchors[0].string if anchors else None
        url = anchors[0]['href'] if anchors else None
        if url is not None and 'action=edit' in url:
            url = None  # red link: the album page has not been created
        results.append((name, url))
    return results
def scrape():
    """Crawl LyricWiki artist index pages and scrape each batch into a timestamped JSON file."""
    next_song = WIKI_URL
    d = datetime.datetime.now()
    timestamp = '{:%Y-%m-%d_%H:%M:%S}'.format(d)
    filename = 'lyricwiki_' + timestamp + '.json'
    while next_song:
        soup = BeautifulSoup(getHtml(next_song))
        artists_a = soup.select('div#mw-pages')[0]('a')
        artists = [(artist.string, artist['href']) for artist in artists_a]
        sj = ScrapeJam(filename, 'lyricwiki_errs.log')
        sj.scrape(artists, getAlbums, getSongs, getLyrics)
        # NOTE(review): this follows the FIRST link inside div#mw-pages as the
        # next page; confirm the "pagefrom" pagination link is not expected to
        # be a later anchor in that block.
        next_song = BASE_URL + soup.select('div#mw-pages')[0]('a')[0]['href']
        if 'pagefrom' not in next_song:  # FIX: idiomatic `not in` (was `not ... in`)
            break
def getLyrics(artist_tuple, album_tuple, song_tuple):
    """Return cleaned lyric nodes from a uta-net mobile song page.

    FIX: removed the unreachable DarkLyrics-style loop that followed the first
    return statement (copy-paste leftover from another scraper module).
    """
    # A mobile Android user agent is sent so the site serves the simple layout.
    soup = BeautifulSoup(getHtml(song_tuple[1], False, 'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'))
    lyrics = soup.select('#kashi_main')[0]
    # Drop the leading <div> inside the lyric container before cleaning.
    lyrics.div.extract()
    return cleanLyricList(lyrics.contents)
def getSongs(artist_tuple, album_tuple):
    """Find album_tuple[0] on its Shift-JIS uta-net page and return (name, url) songs.

    Returns [] when the album header is not found.
    """
    decoded = getHtml(album_tuple[1]).decode('shift_jisx0213')
    soup = BeautifulSoup(decoded, "html.parser")
    url_prefix = 'http://sp.uta-net.com/search/kashi.php?TID='
    for title_tag in soup.select('td.font_base_size_L strong'):
        if title_tag.string != album_tuple[0]:
            continue
        # The song list lives two enclosing tables above the album title.
        outer = title_tag.find_parent("table").find_parent("table")
        anchors = outer.select('.font_base_size a')
        for anchor in anchors:
            # Strip nested markup so .string returns the bare title text.
            for child in anchor.select('*'):
                child.extract()
        return [
            (anchor.string,
             url_prefix + anchor['href'][anchor['href'].index('ID=') + 3:])
            for anchor in anchors
        ]
    return []
def getLyrics(artist_tuple, album_tuple, song_tuple):
    """Return cleaned lyric nodes from a uta-net mobile song page.

    FIX: deleted the dead DarkLyrics-style code that sat after the first
    return statement — it could never execute.
    """
    soup = BeautifulSoup(
        getHtml(
            song_tuple[1], False,
            # Mobile Android UA so the site serves the simple mobile layout.
            'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        ))
    lyrics = soup.select('#kashi_main')[0]
    # Remove the leading <div> inside the lyric container before cleaning.
    lyrics.div.extract()
    return cleanLyricList(lyrics.contents)
def getAlbums(artist_tuple):
    """Return [(album_name, artist_url), ...] from a DarkLyrics artist page.

    Every album reuses the artist page URL; songs are resolved from it later.
    """
    page = BeautifulSoup(getHtml(DARKLYRICS_URL + artist_tuple[1]))
    results = []
    for header in page.select('.album h2 strong'):
        # Album titles are wrapped in quote characters; strip first and last.
        results.append((header.string[1:-1], artist_tuple[1]))
    return results
def getArtists():
    """Return [(artist_name, artist_url), ...] from the uta-net name index page."""
    page = BeautifulSoup(
        getHtml('http://www.uta-net.com/user/search_index/name.html'))
    prefix = 'http://www.uta-net.com/user/search_index/'
    return [(anchor.string, prefix + anchor['href'])
            for anchor in page.select('.album td a')]
def getAlbums(artist_tuple):
    """Return [(album_name, artist_url), ...] from a Shift-JIS uta-net artist page."""
    markup = getHtml(artist_tuple[1]).decode('shift_jisx0213')
    soup = BeautifulSoup(markup, "html.parser")
    # Each album shares the artist page URL; the song list is extracted from it later.
    results = []
    for header in soup.select('td.font_base_size_L strong'):
        results.append((header.string, artist_tuple[1]))
    return results
def getAlbums(artist_tuple):
    """List (album_name, artist_url) pairs found on a DarkLyrics artist page."""
    soup = BeautifulSoup(getHtml(DARKLYRICS_URL + artist_tuple[1]))
    headers = soup.select('.album h2 strong')
    # Titles carry surrounding quote characters — slice them off.
    return [(h.string[1:-1], artist_tuple[1]) for h in headers]
def getSongs(artist_tuple, album_tuple):
    """Return [(song_name, song_url), ...] from a nightwish.com album page."""
    page = BeautifulSoup(getHtml(NIGHTWISH_URL + album_tuple[1]))
    pairs = []
    for anchor in page.select('.box250 .textsmall li a'):
        pairs.append((anchor.string, anchor['href']))
    return pairs
def getLyrics(artist_tuple, album_tuple, song_tuple):
    """Return cleaned lyric nodes from a nightwish.com song page."""
    page = BeautifulSoup(getHtml(NIGHTWISH_URL + song_tuple[1]))
    body = page.select('.content_main_2c .text')[0]
    return cleanLyricList(body.contents)
def getSongs(artist_tuple, album_tuple):
    """Collect the (name, url) song links listed on a nightwish.com album page."""
    album_page = BeautifulSoup(getHtml(NIGHTWISH_URL + album_tuple[1]))
    track_links = album_page.select('.box250 .textsmall li a')
    return [(link.string, link['href']) for link in track_links]
def getLyrics(artist_tuple, album_tuple, song_tuple):
    """Fetch a nightwish.com song page and return its cleaned lyric contents."""
    song_page = BeautifulSoup(getHtml(NIGHTWISH_URL + song_tuple[1]))
    text_block = song_page.select('.content_main_2c .text')[0]
    return cleanLyricList(text_block.contents)
def getAlbums(artist_tuple):
    """List the albums named on a Shift-JIS-encoded uta-net artist page.

    Each entry pairs the album title with the artist page URL, since the song
    list is scraped from that same page later.
    """
    decoded = getHtml(artist_tuple[1]).decode('shift_jisx0213')
    page = BeautifulSoup(decoded, "html.parser")
    titles = page.select('td.font_base_size_L strong')
    return [(title.string, artist_tuple[1]) for title in titles]