price = "Free!" else: price = re.findall("\$[0-9]+", datelong)[0] #This extracts the ticket price descriptionparagraphs = list(bsObj.find("p", {"class":"description"}).next_siblings) # Gets the description. description = "" #Create an empty string counter = 0 #Need to create a loop in order to be able to use the .get_text thingie while (counter < len(descriptionparagraphs)): try: #The content (or lack thereof) of some paragraphs cause fatal errors description += descriptionparagraphs[counter].get_text() + " \u00A4 " # Description may be split between multiple paragraphs. A symbol is concatenated in case, say, the site lists each musician in a separate paragraph. counter += 1 except: counter += 1 description = re.sub('\s+',' ',description) description = description.replace("\u00A4 \u00A4","\u00A4") # In case symbol occurs two times in a row [description, readmore] = scraperLibrary.descriptionTrim(description, ["Watch Video","Visit Website"], 800, artistweb, newhtml) images = bsObj.findAll("img") artistpic = "" for oneimage in images: if "photos" in oneimage.attrs["src"] and starttime != "22:00" and starttime != "23:00": #Pulls photo from site IF not the late show (only want pics for one show per day) artistpic = "http://www.bluesalleylive.com" + oneimage.attrs["src"] break ticketweb = newhtml writer.writerow((dateonly, genre, artistpic, local, doors, price, starttime, newhtml, artist, venuelink, venuename, addressurl, venueaddress, description, readmore, musicurl, ticketweb)) backupwriter.writerow((dateonly, genre, artistpic, local, doors, price, starttime, newhtml, artist, venuelink, venuename, addressurl, venueaddress, description, readmore, musicurl, ticketweb)) # print(loopcounter) csvFile.close() backupCVS.close() yesno = ("y","Y","n","N")
continue localList = scraperLibrary.getLocalList() if scraperLibrary.compactWord(artist) in localList: local = "Yes" else: local = "" try: artistweb = bsObj.find("li", {"class":"web"}).find("a").attrs["href"] #THIS finds the first instance of a li with a class of "web", then digs deeper, finding the first instance w/in that li of a child a, and pulls the href. BUT - since some artists may not have link, using try/except except: artistweb = "" try: # There isn't always a description... description = bsObj.find("div", {"class":"bio"}).get_text() # Get the description, which does include a lot of breaks - will it be a mess? except: description = "" [description, readmore] = scraperLibrary.descriptionTrim(description, [], 800, artistweb, newhtml) descriptionJammed = description.replace(" ","") # Create a string with no spaces if len(re.findall("[A-Z]{15,}", descriptionJammed)) > 0: description = scraperLibrary.killCapAbuse(description) try: iframes = bsObj.findAll("iframe") # If there's a video, grab it and toss it into the "buy music" column. BUT - skip iframes that don't contain youtubes for onei in iframes: if "youtube" in onei.attrs["src"]: musicurl = onei.attrs["src"] break # Once first video is found, move along (don't take back-up band's video over headliner; don't have 'else' overwrite found link) else: musicurl = "" # In case there are iframes, but no videos except: musicurl = ""
): #don't need to re-list artist name in description continue if thetext.lower().startswith("vinyl lounge"): continue description += thetext + " " description = description.strip() description = re.sub('21\+\s[\/\-]*\s*\$[0-9]{,2}\s*(entry)*', '', description) description = description.strip( "--") # If description now leads w/ this, bye-bye [description, readmore] = scraperLibrary.descriptionTrim(description, [ "ON SALE NOW!", "LiveNation and Songbyrd Present", "Songbyrd Presents", "Songbyrd Vinyl Lounge", "Songbyrd and Union Stage Present", "Latin Fluff and Songbyrd Present", "Songbyrd and LiveNation Present" ], 800, artistweb, newhtml) write1 = (date, genre, artistpic, local, doors, price, starttime, newhtml, artist, venuelink, venuename, addressurl, venueaddress, description, readmore, musicurl, ticketweb) write2 = (date, genre, artistpic, local, doors, price, starttime, newhtml, artist, venuelink, venuename, addressurl, venueaddress, description.encode('UTF-8'), readmore, musicurl, ticketweb) write3 = (date, genre, artistpic, local, doors, price, starttime, newhtml, artist.encode('UTF-8'), venuelink, venuename, addressurl, venueaddress, description.encode('UTF-8'), readmore, musicurl, ticketweb)
"class": "artist_content" }).get_text().strip() # Get the description. except: description = "" description = re.sub( '((Tickets\s)|(TICKETS\s))([gG][oO]\s)*((on[\s\-]sale\s)|(ON[\s\-]SALE\s))[A-Za-z]+\,*\s([0-9\/\-]{3,5}|([a-zA-Z]+)\s[0-9]{1,2})\s(\@|[aA][tT])\s(([0-9]{1,2}[aA][mM])|([nN][oO][oO][nN]))', '', description) description = description.replace("SUMMIT", "Summit") description = description.replace("DJ BASSCAMP PRESENTS", "DJ Basscamp Presents") description = description.replace("RESIDENT", "resident") [ description, readmore ] = scraperLibrary.descriptionTrim(description, [ "TICKETS ON SALE NOW", "FREE | EVERY SATURDAY NIGHT | MAIN ROOM (1ST FLOOR) | 21+ | 11:30 pm – close" ], 800, artistweb, newhtml) try: ticketurl = bsObj.find("div", { "class": "ticket_btn" }).find("a").attrs[ "href"] # Get the ticket sales URL; in a try/except in case tickets only at door or free except: print("Didn't find ticket sales for ", newhtml) ticketurl = "" musicurl = "" try: iframes = bsObj.findAll( "iframe" ) # If there's a video, grab it and toss it into the "buy music" column. BUT - skip iframes that don't contain youtubes
if gotartistlink == True and gotmusicurl == True: break except: artistweb = "" description = "" for onepara in bsObj.findAll("p"): try: howaboutthis = onepara.get_text().strip() if howaboutthis.startswith("website") or howaboutthis.startswith("soundcloud") or howaboutthis.startswith("music |") or howaboutthis.startswith("resident advisor") or "645 Florida" in howaboutthis or "Copyright" in howaboutthis: continue else: description += howaboutthis + " " except: continue [description, readmore] = scraperLibrary.descriptionTrim(description, ["facebook","resident advisor","twitter","soundcloud"], 700, artistweb, newhtml) descriptionJammed = description.replace(" ","") # Create a string with no spaces if len(re.findall("[A-Z]{15,}", descriptionJammed)) > 0: description = scraperLibrary.killCapAbuse(description) description = re.sub('music\s+\|','',description) description = description.replace("|"," ") try: ticketweb = bsObj.find("a", {"id":"hypTickets"}).attrs["href"] except: ticketweb = "" findthetime = bsObj.findAll("div", {"class":"col-12"}) starttime = "" for onediv in findthetime: try:
if "Open Mic" in artist or "Gordon Sterling" in artist: genre = "Potpourri" local = "Yes" try: artistweb = bsObj.find("li", { "class": "web" }).find("a").attrs["href"] except: artistweb = "" try: # There isn't always a description... description = bsObj.find("div", {"class": "bio"}).get_text() except: description = "" [description, readmore] = scraperLibrary.descriptionTrim( description, [], 800, artistweb, newhtml) #U Street gets shorter descriptions descriptionjammed = description.replace( " ", "") # Create a string with no spaces descriptionJammed = description.replace( " ", "") # Create a string with no spaces if len(re.findall("[A-Z]{15,}", descriptionJammed)) > 0: description = scraperLibrary.killCapAbuse(description) musicurl = "" try: iframes = bsObj.findAll( "iframe" ) # If there's a video, grab it and toss it into the "buy music" column. BUT - skip iframes that don't contain youtubes for onei in iframes:
except: artistweb = "" try: description = bsObj.find("div", { "class": "bio" }).get_text().strip() except: description = "" print("Found no description") description = description.replace( " / ", "").strip("/").strip().strip("/").strip() description = re.sub( '((Tickets\s)|(TICKETS\s))([gG][oO]\s)*((on[\s\-]sale\s)|(ON[\s\-]SALE\s))[A-Za-z]+\,*\s([0-9\/\-]{3,5}|([a-zA-Z]+)\s[0-9]{1,2})\s(\@|[aA][tT])\s(([0-9]{1,2}[aA][mM])|([nN][oO][oO][nN]))', '', description) [description, readmore] = scraperLibrary.descriptionTrim( description, ["OFFICIAL WEBSITE", "TWITTER", "FACEBOOK"], 800, artistweb, newhtml) try: musicurl = bsObj.find("li", { "class": "soundcloud" }).find("a").attrs["href"] except: try: musicurl = bsObj.find("li", { "class": "bandcamp" }).find("a").attrs["href"] except: try: iframes = eventObj.findAll( "iframe" ) # If there's a video, grab it and toss it into the "buy music" column. BUT - skip iframes that don't contain youtubes