Example #1
import urlparse

from bs4 import BeautifulSoup

# urlz and the other helpers used below come from the surrounding project.
def get_song_lyrics(parsed_url, url):
    """
    Takes in two urls--sometimes, with artist collaborations, the second url is
    not a relative URL, but a complete one. I use urlparse to join the two urls
    such that they are baseURL invariant.

    Returns the song lyrics, and prints "Success!" if successful.
    """
    if url.startswith(".."):
        url = urlparse.urljoin(parsed_url.geturl(), url)
    print "Parsing %s..." % (url)

    page      = urlz.open_and_read(url)
    soup      = BeautifulSoup(page, "lxml")
    page_text = soup.get_text()
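    # The lyrics sit between the "Print" link text and the trailing
    # "if  ( /Android" script snippet in the page text.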
    lyrics    = page_text[(page_text.index("Print") + 5):page_text.index("if  ( /Android")].strip("\n")

    if lyrics is None:
        print "Nothing returned."
    elif len(lyrics) > 0:
        print "Success!"
    else:
        print "Lyrics are empty."

    return lyrics
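
A minimal usage sketch for the function above, assuming a hypothetical artist page; the URLs are made up, and urlz together with get_song_lyrics come from the surrounding scraper module:

# Hypothetical usage; the URLs below are illustrative only.
import urlparse

artist_url = "http://www.example-lyrics.com/e/example_artist.html"
parsed_url = urlparse.urlparse(artist_url)

# A song link starting with ".." is resolved against the artist page before
# fetching; complete URLs are fetched as-is.
lyrics = get_song_lyrics(parsed_url, "../o/other_artist/duet.html")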
Example #2
import urlparse

from bs4 import BeautifulSoup

# get_name_from_url, find_songlist_tag, decode_json, write_songs and urlz are
# helpers from the surrounding project.
def scrape_artist_for_song_urls(url):
    """
    Takes in a URL, opens it, and parses the page for song URLs and names.
    """
    try:
        parsed_url   = urlparse.urlparse(url)
        artist       = get_name_from_url(url)
        page         = urlz.open_and_read(url)
        soup         = BeautifulSoup(page, "lxml")

        # The page embeds the song list as a JavaScript array; trim the script
        # text down to a bare JSON object before decoding it.
        songlist_tag = " ".join(find_songlist_tag(soup).split("}];")[0].split("\n")[1:]).strip()
        json_arg     = "{%s}" % (songlist_tag.split("[", 1)[1].rsplit("]", 1)[0].lstrip(" "))
        song_list    = decode_json(json_arg)

        write_songs(artist, song_list, parsed_url)
    except Exception as err:
        print "Skipping %s (%s)" % (url, err)