def albumList(query, arg='titles'):
    """Scrape an artist page on rapgenius and return album titles.

    Pass arg='links' to get the album page URLs instead of the titles.
    """
    titles = []
    urls = []
    slug = '-'.join(query.split())
    page = pageopen.openPage("http://rapgenius.com/artists/%s" % slug)
    for tag in page.find_all(class_="album_link"):
        markup = str(tag)
        markup = markup.replace('<a class="album_link" href="', '')
        markup = markup.replace('">', ',')
        markup = markup.replace('</a>', '')
        parts = markup.split(',')
        titles.append(parts[1])
        urls.append('http://rapgenius.com%s' % parts[0])
    return urls if arg == 'links' else titles
def getAlbumData(artist, query):
    """Scrape an album page.

    Returns a list whose first element is the album's og:description text,
    followed by [track_number, title] pairs for each song.
    """
    tracks = []
    artist_slug = '-'.join(artist.split())
    album_slug = '-'.join(query.split())
    url = "http://rapgenius.com/albums/%s/%s" % (artist_slug, album_slug)
    soup = pageopen.openPage(url)
    # Description: strip the surrounding meta-tag markup, keep the content.
    meta = str(soup.find_all(property="og:description"))
    meta = re.sub(r'\<.*?\"', '', meta)
    meta = re.sub(r'\" .*?\>', '', meta)
    tracks.append(meta)
    for number, tag in enumerate(soup.find_all(class_="song_name"), start=1):
        title = re.sub(r'\<.*?\>', '', str(tag))
        title = ' '.join(title.split())
        # Titles render as "Artist – Title Lyrics"; keep just the title.
        title = title.split('– ')[1]
        title = title.split(' Lyrics')[0]
        tracks.append([number, title])
    return tracks
def getPopularSongs(artist, arg=None):
    """Scrape the popular-songs list from an artist's page.

    Returns song titles by default, or the song URLs when arg == 'link'.
    """
    titles = []
    urls = []
    slug = '-'.join(artist.split())
    soup = pageopen.openPage("http://rapgenius.com/artists/%s" % slug)
    listing = str(soup.find_all(class_="song_list")[0])
    for chunk in listing.split('</li'):
        match = re.search(r'\"\/.*?\"', chunk)
        if match is not None:
            href = match.group(0).replace('"', '')
            urls.append('http://rapgenius.com%s' % href)
        title = re.sub(r'\<.*?\>', '', chunk)
        # NOTE(review): the '&' -> '&' replace is a no-op; it was probably
        # meant to decode '&amp;' before the source got mangled -- confirm.
        title = title.replace('>', '').replace('&', '&')
        titles.append(' '.join(title.split()))
    # The trailing split fragment cleans down to '' -- drop it.
    titles.remove('')
    return urls if arg == 'link' else titles
def getSongs(link):
    """Collect song titles from every page of a paginated song list.

    `link` is a URL whose query string contains ';page=N&' where N is the
    total page count (as produced by openPage()).
    """
    tracks = []
    match = re.search(r';page\=.*?\;', link)
    total = match.group(0)
    total = total.replace(';page=', '').replace('&', '')
    total = int(total)
    # BUG FIX: the original iterated range(1, total), silently skipping the
    # last page; the other versions of this function use total + 1.
    for i in range(1, total + 1):
        page_marker = ';page=%d&' % i
        # Splice the current page number into the pagination URL.
        url = re.sub(r';page\=.*?\;', '{', link)
        url = page_marker.join(url.split('{'))
        soup = pageopen.openPage(url)
        songs = str(soup.find_all(class_="song_list"))
        for song in songs.split('</li>'):
            song = ' '.join(song.split())
            song = re.sub(r'\<.*?\>', '', song)
            song = song.replace('&', '&').replace('[', '')
            song = song.strip()
            if song != ']':
                tracks.append(song)
    return tracks
def searchSong(artist, query, arg='data'):
    """Scrape a song's lyric lines and their annotation links.

    Returns [ [lyric, annotation], ... ] by default; arg='lyrics' returns
    just the lyric strings, arg='link' just the annotation URLs.
    """
    data = []
    lyrics = []
    annotations = []
    artist = '-'.join(artist.split())
    query = '-'.join(query.split())
    query = query.replace("'", '')
    query = artist + '-' + query + '-lyrics'
    url = "http://rapgenius.com/%s" % query
    soup = pageopen.openPage(url)
    text = soup.find_all(class_="lyrics")
    l = len(text)
    for i in range(0, l):
        words = str(text[i])
        # Strip the opening <div ...> wrapper without touching inner tags.
        words = words.replace("<div", "{<div").replace('">', '">}')
        words = re.sub(r'\{.*?\}', '', words)
        words = words.replace('">}', '">')
        words = words.replace("<i>", '').replace("</i>", '')
        words = words.replace('<br/>', '')
        words = words.replace("<p>", '').replace("</p>", '')
        words = words.split("</a>")
        m = len(words)
        for j in range(0, m - 1):
            a = re.search(r'\<.*?\>', words[j])
            lyric = re.sub(r'\<.*?\>', '', words[j])
            lyric = lyric.strip()
            annotation = a.group(0)
            lyrics.append(lyric)
            search1 = annotation.find('"no_annotation"')
            search2 = annotation.find('data-editorial-state')
            if search1 != -1:
                annotations.append("Not annotated")
            elif search2 != -1:
                annotation = re.sub(r'\<.*?\/', '', annotation)
                annotation = annotation.replace('">', '')
                annotations.append('http://rapgenius.com/' + annotation)
            else:
                # BUG FIX: without this branch `annotations` could end up
                # shorter than `lyrics`, raising IndexError in the pairing
                # loop below (the other copy of this function has it).
                annotations.append("Not annotated")
    m = len(lyrics)
    for i in range(0, m):
        data.append([lyrics[i], annotations[i]])
    if arg == 'lyrics':
        return lyrics
    elif arg == 'link':
        return annotations
    else:
        return data
def searchWords(query, arg='data'):
    """Search rapgenius for `query` across every result page.

    Returns [ [title, artist], ... ] pairs, or the matching song URLs when
    arg == 'link'.
    """
    data = []
    links = []
    num = pagination.getTotalPages(query)
    for page in range(1, num + 1):
        url = "http://rapgenius.com/search?page=%d&q=%s" % (page, query)
        soup = pageopen.openPage(url)
        for tag in soup.find_all(class_="search_result"):
            # Collapse whitespace and strip pre-existing commas so ',' can
            # serve as a field delimiter below.
            clean = ' '.join(str(tag).split()).replace(',', '')
            clean = clean.replace('<li class="search_result">', '')
            clean = clean.replace('<a class=" song_link" href="', '')
            clean = clean.replace('"> <span class="title_with_artists"> ', ',')
            clean = clean.replace('<em>', '').replace('</em>', '')
            clean = clean.replace('</span>', '').replace('</a>', '')
            clean = clean.replace('</li>', '').replace('<br/>', ' ')
            clean = clean.replace('</p>', '')
            clean = clean.replace(' <p>', ',')
            clean = clean.replace(' – ', ',')
            clean = clean.replace('&', '&')
            fields = clean.split(',')
            fields[0] = 'http://rapgenius.com/%s' % fields[0]
            fields[2] = fields[2].strip()
            data.append([fields[1], fields[2]])
            links.append(fields[0])
    return links if arg == 'link' else data
def searchAnnotations(query):
    """Fetch an annotation page (`query` is a URL) and return its text."""
    soup = pageopen.openPage(query)
    note = str(soup.find_all(id="main"))
    note = note.split("</div>")[1]
    note = note.split("<p><em>")[0]
    for old, new in (("<p>", ''), ("</p>", ''),
                     ('<strong>', ''), ('</strong>', ''),
                     ('<em>', '"'), ('</em>', '"'),
                     # NOTE(review): '</blockquote' (no '>') looks like a
                     # typo for '</blockquote>' -- preserved as-is; confirm.
                     ("<blockquote>", '"'), ('</blockquote', '"')):
        note = note.replace(old, new)
    return note
def getSongs(link):
    """Walk every page of a paginated song list.

    Returns [ [url, title], ... ] pairs for each song found.
    """
    data = []
    tracks = []
    links = []
    l = re.search(r';page\=.*?\;', link)
    l = l.group(0)
    l = l.replace(';page=', '').replace('&', '')
    l = int(l)
    for i in range(1, l + 1):
        pageNo = ';page=%d&' % i
        # Splice the current page number into the pagination URL.
        url = re.sub(r';page\=.*?\;', '{', link)
        url = pageNo.join(url.split('{'))
        soup = pageopen.openPage(url)
        songs = str(soup.find_all(class_="song_list"))
        songsList = songs.split('</li>')
        if len(songsList) > 1:
            for song in songsList:
                song = ' '.join(song.split())
                if song != '</ul>]':
                    p = re.search(r'\/.*?\"', song)
                    page = p.group(0).replace('"', '')
                    # BUG FIX: 'http:%s' produced malformed URLs such as
                    # 'http:/songs/...'; use the full host like the other
                    # copy of this function does.
                    links.append('http://rapgenius.com%s' % page)
                    song = re.sub(r'\<.*?\>', '', song)
                    song = song.replace('&', '&').replace('[', '')
                    song = song.strip()
                    if song != ']':
                        tracks.append(song)
    for i in range(0, len(tracks)):
        data.append([links[i], tracks[i]])
    return data
def getSongs(link):
    """Walk every page of a paginated song list.

    Returns [ [url, title], ... ] pairs for each song found.
    """
    data = []
    tracks = []
    links = []
    count = re.search(r';page\=.*?\;', link).group(0)
    count = int(count.replace(';page=', '').replace('&', ''))
    for page in range(1, count + 1):
        marker = ';page=%d&' % page
        # Splice the current page number into the pagination URL.
        url = re.sub(r';page\=.*?\;', '{', link)
        url = marker.join(url.split('{'))
        soup = pageopen.openPage(url)
        entries = str(soup.find_all(class_="song_list")).split('</li>')
        if len(entries) > 1:
            for entry in entries:
                entry = ' '.join(entry.split())
                if entry != '</ul>]':
                    href = re.search(r'\/.*?\"', entry).group(0)
                    links.append('http://rapgenius.com%s' % href.replace('"', ''))
                    title = re.sub(r'\<.*?\>', '', entry)
                    title = title.replace('&', '&').replace('[', '')
                    title = title.strip()
                    if title != ']':
                        tracks.append(title)
    # Links may outnumber titles (']' fragments are filtered); pair up to
    # the shorter list, exactly as the original index loop did.
    for pair_url, pair_title in zip(links, tracks):
        data.append([pair_url, pair_title])
    return data
def searchAnnotations(query):
    """Fetch an annotation page at URL `query` and return its plain text."""
    soup = pageopen.openPage(query)
    raw = str(soup.find_all(id="main"))
    raw = raw.split("</div>")[1].split("<p><em>")[0]
    raw = raw.replace("<p>", '').replace("</p>", '')
    raw = raw.replace('<strong>', '').replace('</strong>', '')
    raw = raw.replace('<em>', '"').replace('</em>', '"')
    # NOTE(review): '</blockquote' without the closing '>' is suspicious but
    # kept byte-identical to preserve the original behaviour -- confirm.
    raw = raw.replace("<blockquote>", '"').replace('</blockquote', '"')
    return raw
def getArtistBio(artist):
    """Return the artist bio scraped from the page's og:description tag."""
    artist = '-'.join(artist.split())
    url = "http://rapgenius.com/artists/%s" % artist
    soup = pageopen.openPage(url)
    text = soup.find_all(property="og:description")
    # BUG FIX: `bio` was unbound (NameError at the return) when no
    # og:description tag exists; default to an empty string.
    bio = ''
    for tag in text:
        bio = str(tag)
        bio = re.sub(r'\<.*?\"', '', bio)
        bio = re.sub(r'\".*?\>', '', bio)
        # NOTE(review): '&' -> '&' is a no-op, probably once '&amp;' -> '&'.
        bio = bio.replace('&', '&')
    return bio
def searchWords(query, arg='data'):
    """Search rapgenius for `query` and collect results from every page.

    Returns [ [title, artist], ... ] pairs, or the matching song URLs when
    arg == 'link'.
    """
    data = []
    links = []
    num = pagination.getTotalPages(query)
    # BUG FIX: the original looped range(1, 2), fetching only the first
    # page and ignoring the total page count it had just computed (the
    # other copy of this function uses num + 1).
    for i in range(1, num + 1):
        url = "http://rapgenius.com/search?page=%d&q=%s" % (i, query)
        soup = pageopen.openPage(url)
        text = soup.find_all(class_="search_result")
        for j in range(0, len(text)):
            # Force to string, collapse whitespace, and strip pre-existing
            # commas so ',' can serve as a field delimiter.
            coded = str(text[j])
            clean = ' '.join(coded.split()).replace(',', '')
            clean = clean.replace('<li class="search_result">', '')
            clean = clean.replace('<a class=" song_link" href="', '')
            clean = clean.replace('"> <span class="title_with_artists"> ', ',')
            clean = clean.replace('<em>', '').replace('</em>', '')
            clean = clean.replace('</span>', '').replace('</a>', '')
            clean = clean.replace('</li>', '').replace('<br/>', ' ')
            clean = clean.replace('</p>', '')
            clean = clean.replace(' <p>', ',')
            clean = clean.replace(' – ', ',')
            clean = clean.replace('&', '&')
            results = clean.split(',')
            results[0] = 'http://rapgenius.com/%s' % results[0]
            results[2] = results[2].strip()
            data.append([results[1], results[2]])
            links.append(results[0])
    if arg == 'link':
        return links
    else:
        return data
def openPage(artist):
    """Return the URL of the last pagination link on an artist's page."""
    slug = '-'.join(artist.split())
    # NOTE(review): the page is fetched from genius.com but the link below
    # is prefixed with rapgenius.com -- confirm which domain is intended.
    soup = pageopen.openPage("http://genius.com/artists/%s" % slug)
    anchors = str(soup.find_all(class_="pagination")).split('</a>')
    last = anchors[len(anchors) - 3]
    last = last.replace('<a href="', 'http://rapgenius.com')
    return last.split('"')[0].strip()
def openPage(artist):
    """Return the URL of the last pagination link on an artist's page."""
    slug = '-'.join(artist.split())
    url = "http://rapgenius.com/artists/%s" % slug
    soup = pageopen.openPage(url)
    fragments = str(soup.find_all(class_="pagination")).split('</a>')
    # Third-from-last anchor is the highest-numbered page link.
    tail = fragments[len(fragments) - 3]
    tail = tail.replace('<a href="', 'http://rapgenius.com')
    tail = tail.split('"')[0]
    return tail.strip()
def getTotalPages(query):
    """Return the number of search-result pages for `query`."""
    soup = pageopen.openPage('http://genius.com/search?q=%s' % query)
    anchors = str(soup.find_all(class_="pagination")).split('</a>')
    # Third-from-last anchor is the highest-numbered page link.
    last = anchors[len(anchors) - 3]
    last = last.replace('<a href="', '').split('"')[0].strip()
    count = re.search(r'\/.*?\&', last).group(0)
    count = count.replace('/search?page=', '').replace('&', '')
    return int(count)
def getTotalPages(query):
    """Return how many pages of search results exist for `query`."""
    url = 'http://rapgenius.com/search?q=%s' % query
    soup = pageopen.openPage(url)
    chunks = str(soup.find_all(class_="pagination")).split('</a>')
    # Third-from-last anchor carries the highest page number.
    link = chunks[len(chunks) - 3]
    link = link.replace('<a href="', '').split('"')[0].strip()
    num = re.search(r'\/.*?\&', link).group(0)
    return int(num.replace('/search?page=', '').replace('&', ''))
def getDates(query):
    """Pair each of an artist's album titles with its release date.

    Returns [ [title, date], ... ] where date is 'None' when no
    parenthesised year is found in the album heading.
    """
    titles = albumList(query, 'titles')
    links = albumList(query, 'links')
    dates = []
    info = []
    for idx in range(0, len(links)):
        soup = pageopen.openPage(links[idx])
        for tag in soup.find_all('h1', class_="name"):
            header = ' '.join(str(tag).split())
            # Release dates appear in parentheses as (19xx) or (20xx).
            if header.find('(1') != -1:
                year = '1' + header.split('(1')[1]
                dates.append(year.split(')')[0])
            elif header.find('(2') != -1:
                year = '2' + header.split('(2')[1]
                dates.append(year.split(')')[0])
            else:
                dates.append('None')
        # NOTE(review): dates is indexed by album position, which assumes
        # exactly one h1.name per album page -- confirm.
        info.append([titles[idx], dates[idx]])
    return info
def searchSong(artist, query, arg='data'):
    """Scrape a song page for lyric lines and annotation links.

    arg='lyrics' -> list of lyric strings
    arg='link'   -> list of annotation URLs / 'Not annotated'
    otherwise    -> list of [lyric, annotation] pairs
    """
    lyric_lines = []
    notes = []
    artist_slug = '-'.join(artist.split())
    song_slug = '-'.join(query.split()).replace("'", '')
    slug = artist_slug + '-' + song_slug + '-lyrics'
    soup = pageopen.openPage("http://rapgenius.com/%s" % slug)
    for block in soup.find_all(class_="lyrics"):
        markup = str(block)
        # Drop the opening <div ...> wrapper, then unwrap the simple tags.
        markup = markup.replace("<div", "{<div").replace('">', '">}')
        markup = re.sub(r'\{.*?\}', '', markup)
        markup = markup.replace('">}', '">')
        for tag in ("<i>", "</i>", '<br/>', "<p>", "</p>"):
            markup = markup.replace(tag, '')
        pieces = markup.split("</a>")
        for piece in pieces[:-1]:
            anchor = re.search(r'\<.*?\>', piece).group(0)
            lyric_lines.append(re.sub(r'\<.*?\>', '', piece).strip())
            if anchor.find('"no_annotation"') != -1:
                notes.append("Not annotated")
            elif anchor.find('data-editorial-state') != -1:
                href = re.sub(r'\<.*?\/', '', anchor).replace('">', '')
                notes.append('http://rapgenius.com/' + href)
            else:
                notes.append("Not annotated")
    if arg == 'lyrics':
        return lyric_lines
    if arg == 'link':
        return notes
    return [[line, note] for line, note in zip(lyric_lines, notes)]