def _fetch_from_lyricsdotcom(artist, song, timeout=10):
    '''
    Downloads the lyrics.com page for a song and returns its lyrics as text.

    The page URL is built as
    ``http://www.lyrics.com/<song>-lyrics-<artist>.html``; ``artist`` and
    ``song`` are interpolated as given, with no escaping applied here.

    Args:
        artist: Artist name as it should appear in the URL.
        song: Song title as it should appear in the URL.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Pass ``None`` to wait indefinitely, which was the
            behaviour before this parameter existed.

    Returns:
        The text of the page's ``<div id="lyrics">`` element(s) after being
        passed through ``mlstripper.strip_tags``.
    '''
    url = 'http://www.lyrics.com/%s-lyrics-%s.html' % (song, artist)
    # Without an explicit timeout requests will block forever on a stalled
    # connection, so always bound the wait.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'lxml')

    # NOTE(review): the whole result list is stringified (not each tag), so
    # the text may carry the list's surrounding brackets -- confirm whether
    # callers rely on that.
    lyrics = str(soup.findAll('div', {'id': 'lyrics'}))
    lyrics = mlstripper.strip_tags(lyrics)

    # Turn the literal two-character sequence backslash + 'n' back into real
    # line breaks.
    lyrics = lyrics.replace('\\n', '\n')
    return lyrics
def parse(self):
    '''
    Parses this Billboard 200 page and returns scraped tuples of form:
    (date, position, album, artist)

    ``date`` is ``self._date`` and is the same for every tuple. The
    positions, albums and artists are matched up by their order on the
    page; if the three lists differ in length, the result is truncated to
    the shortest one.
    '''
    soup = BeautifulSoup(self.read_page(), "lxml")

    # Chart positions: the first child of each rank <span>, converted to int.
    rank_tags = soup.findAll("span", {"class": "chart-row__current-week"})
    ranks = [int(tag.contents[0]) for tag in rank_tags]

    # Album titles: the first child of each <h2>, passed through self.clean.
    album_tags = soup.findAll("h2", {"class": "chart-row__song"})
    albums = [self.clean(str(tag.contents[0])) for tag in album_tags]

    # Artists: the whole <h3> is stringified and tag-stripped (rather than
    # taking only the first child) before being cleaned.
    artist_tags = soup.findAll("h3", {"class": "chart-row__artist"})
    artists = [
        self.clean(mlstripper.strip_tags(str(tag))) for tag in artist_tags
    ]

    return [
        (self._date, rank, album, artist)
        for rank, album, artist in zip(ranks, albums, artists)
    ]