Python descriptionTrim Exemples, scraperLibrary.descriptionTrim Python Exemples

Exemple #1

0

Afficher le fichier

            price = "Free!"
        else:
            price = re.findall("\$[0-9]+", datelong)[0]  #This extracts the ticket price
        # Gather the text of every node that follows the <p class="description">
        # marker.  The description may be split between multiple paragraphs, so
        # a separator symbol is appended after each one (useful when, say, the
        # site lists each musician in a separate paragraph).
        descriptionparagraphs = list(bsObj.find("p", {"class":"description"}).next_siblings)
        description = ""
        for oneparagraph in descriptionparagraphs:
            try:
                description += oneparagraph.get_text() + " \u00A4 "
            except Exception:
                # The content (or lack thereof) of some siblings makes
                # get_text() fail; skip those (best effort) without also
                # swallowing KeyboardInterrupt / SystemExit.
                continue
        description = re.sub(r'\s+', ' ', description)  # Collapse each whitespace run to one space
        # Collapse any run of consecutive separators into a single one.  A plain
        # str.replace of the doubled symbol only merges pairs, so three or more
        # in a row would still leave a doubled separator behind.
        description = re.sub('\u00A4(?: \u00A4)+', '\u00A4', description)
        
        [description, readmore] = scraperLibrary.descriptionTrim(description, ["Watch Video","Visit Website"], 800, artistweb, newhtml)

        images = bsObj.findAll("img")
        artistpic = ""
        # Only want pics for one show per day, so the late shows (22:00 / 23:00)
        # are left without a picture.  The check does not depend on the image,
        # so it is done once instead of on every loop iteration.
        if starttime != "22:00" and starttime != "23:00":
            for oneimage in images:
                # .get() so an <img> without a src attribute is simply skipped
                # instead of raising KeyError.
                imagesrc = oneimage.attrs.get("src", "")
                if "photos" in imagesrc:
                    artistpic = "http://www.bluesalleylive.com" + imagesrc
                    break  # First matching photo wins
        ticketweb = newhtml
        writer.writerow((dateonly, genre, artistpic, local, doors, price, starttime, newhtml, artist, venuelink, venuename, addressurl, venueaddress, description, readmore, musicurl, ticketweb))
        backupwriter.writerow((dateonly, genre, artistpic, local, doors, price, starttime, newhtml, artist, venuelink, venuename, addressurl, venueaddress, description, readmore, musicurl, ticketweb))
#        print(loopcounter)
csvFile.close()
backupCVS.close()

yesno = ("y","Y","n","N")

Exemple #2

0

Afficher le fichier

Fichier : gypsysallyscraper.py Projet : DavidSundland/python-web-scrapers

            continue
        localList = scraperLibrary.getLocalList()
        if scraperLibrary.compactWord(artist) in localList:
            local = "Yes"
        else:
            local = ""
        try:
            artistweb = bsObj.find("li", {"class":"web"}).find("a").attrs["href"]  #THIS finds the first instance of a li with a class of "web", then digs deeper, finding the first instance w/in that li of a child a, and pulls the href.  BUT - since some artists may not have link, using try/except
        except:
            artistweb = ""
        try: # There isn't always a description...
            description = bsObj.find("div", {"class":"bio"}).get_text() # Get the description, which does include a lot of breaks - will it be a mess?
        except:
            description = ""

        [description, readmore] = scraperLibrary.descriptionTrim(description, [], 800, artistweb, newhtml)

        # Remove the spaces so that capitals spread over several words form one
        # unbroken run; 15 or more capitals in a row sends the description
        # through scraperLibrary.killCapAbuse.
        descriptionJammed = "".join(description.split(" "))
        if re.search("[A-Z]{15,}", descriptionJammed) is not None:
            description = scraperLibrary.killCapAbuse(description)
        
        # If there's a video, grab it and toss it into the "buy music" column,
        # skipping iframes that don't contain youtubes.  The default is set up
        # front so that a page with no iframes at all cannot leave musicurl
        # unassigned (or still holding a value assigned earlier).
        musicurl = ""
        try:
            iframes = bsObj.findAll("iframe")
            for onei in iframes:
                # .get() so an iframe without src is skipped rather than
                # raising KeyError and ending the search early.
                iframesrc = onei.attrs.get("src", "")
                if "youtube" in iframesrc:
                    musicurl = iframesrc
                    break  # Once first video is found, move along (don't take back-up band's video over headliner)
        except Exception:
            musicurl = ""

Exemple #3

0

Afficher le fichier

            ):  #don't need to re-list artist name in description
                continue
            if thetext.lower().startswith("vinyl lounge"):
                continue
            description += thetext + " "
        description = description.strip()
        # Drop the "21+ / $NN entry" style age/cover notice.  The pattern is a
        # raw string so the regex escapes (\+, \s, \$, ...) reach the re module
        # untouched instead of being parsed as invalid string-literal escapes.
        description = re.sub(r'21\+\s[\/\-]*\s*\$[0-9]{,2}\s*(entry)*', '',
                             description)
        # NOTE: str.strip takes a set of characters, so this removes every
        # leading and trailing "-", not just a single leading "--".
        description = description.strip(
            "--")  # If description now leads w/ this, bye-bye

        [description,
         readmore] = scraperLibrary.descriptionTrim(description, [
             "ON SALE NOW!", "LiveNation and Songbyrd Present",
             "Songbyrd Presents", "Songbyrd Vinyl Lounge",
             "Songbyrd and Union Stage Present",
             "Latin Fluff and Songbyrd Present",
             "Songbyrd and LiveNation Present"
         ], 800, artistweb, newhtml)

        write1 = (date, genre, artistpic, local, doors, price, starttime,
                  newhtml, artist, venuelink, venuename, addressurl,
                  venueaddress, description, readmore, musicurl, ticketweb)
        write2 = (date, genre, artistpic, local, doors, price, starttime,
                  newhtml, artist, venuelink,
                  venuename, addressurl, venueaddress,
                  description.encode('UTF-8'), readmore, musicurl, ticketweb)
        write3 = (date, genre, artistpic, local, doors, price, starttime,
                  newhtml, artist.encode('UTF-8'), venuelink,
                  venuename, addressurl, venueaddress,
                  description.encode('UTF-8'), readmore, musicurl, ticketweb)

Exemple #4

0

Afficher le fichier

                "class": "artist_content"
            }).get_text().strip()  # Get the description.
        except:
            description = ""
        # Remove "Tickets (go) on sale <day>, <date> at <time>" announcements.
        # Raw string so the regex escapes are not parsed as invalid
        # string-literal escapes.
        # NOTE: the time part only recognises "<n>am" or "noon".
        description = re.sub(
            r'((Tickets\s)|(TICKETS\s))([gG][oO]\s)*((on[\s\-]sale\s)|(ON[\s\-]SALE\s))[A-Za-z]+\,*\s([0-9\/\-]{3,5}|([a-zA-Z]+)\s[0-9]{1,2})\s(\@|[aA][tT])\s(([0-9]{1,2}[aA][mM])|([nN][oO][oO][nN]))',
            '', description)
        # Tone down a few phrases that are written in all capitals.
        description = description.replace("SUMMIT", "Summit")
        description = description.replace("DJ BASSCAMP PRESENTS",
                                          "DJ Basscamp Presents")
        description = description.replace("RESIDENT", "resident")

        [
            description, readmore
        ] = scraperLibrary.descriptionTrim(description, [
            "TICKETS ON SALE NOW",
            "FREE | EVERY SATURDAY NIGHT | MAIN ROOM (1ST FLOOR) | 21+ | 11:30 pm – close"
        ], 800, artistweb, newhtml)

        try:
            ticketurl = bsObj.find("div", {
                "class": "ticket_btn"
            }).find("a").attrs[
                "href"]  # Get the ticket sales URL; in a try/except in case tickets only at door or free
        except:
            print("Didn't find ticket sales for ", newhtml)
            ticketurl = ""
        musicurl = ""
        try:
            iframes = bsObj.findAll(
                "iframe"
            )  # If there's a video, grab it and toss it into the "buy music" column.  BUT - skip iframes that don't contain youtubes

Exemple #5

0

Afficher le fichier

Fichier : flashscraper.py Projet : DavidSundland/python-web-scrapers

                if gotartistlink == True and gotmusicurl == True:
                    break
        except:
            artistweb = ""
        description = ""
        # Build the description from every <p> on the page, leaving out
        # paragraphs that start with a link label or that contain the street
        # address / copyright text.
        for onepara in bsObj.findAll("p"):
            try:
                howaboutthis = onepara.get_text().strip()
            except Exception:
                continue  # Paragraph text could not be read; skip it
            # str.startswith accepts a tuple, so one call covers every prefix.
            if howaboutthis.startswith(("website", "soundcloud", "music |", "resident advisor")):
                continue
            if "645 Florida" in howaboutthis or "Copyright" in howaboutthis:
                continue
            description += howaboutthis + " "

        [description, readmore] = scraperLibrary.descriptionTrim(description, ["facebook","resident advisor","twitter","soundcloud"], 700, artistweb, newhtml)

        descriptionJammed = description.replace(" ","") # Create a string with no spaces
        # 15+ capitals in a row (ignoring spaces) sends the description through
        # scraperLibrary.killCapAbuse; re.search stops at the first hit.
        if re.search("[A-Z]{15,}", descriptionJammed):
            description = scraperLibrary.killCapAbuse(description)

        # Raw string so \s and \| are regex escapes, not invalid
        # string-literal escapes.
        description = re.sub(r'music\s+\|','',description)
        description = description.replace("|"," ")
        try:
            ticketweb =  bsObj.find("a", {"id":"hypTickets"}).attrs["href"]
        except:
            ticketweb = ""
        findthetime = bsObj.findAll("div", {"class":"col-12"})
        starttime = ""
        for onediv in findthetime:
            try:

Exemple #6

0

Afficher le fichier

        if "Open Mic" in artist or "Gordon Sterling" in artist:
            genre = "Potpourri"
            local = "Yes"
        try:
            artistweb = bsObj.find("li", {
                "class": "web"
            }).find("a").attrs["href"]
        except:
            artistweb = ""
        try:  # There isn't always a description...
            description = bsObj.find("div", {"class": "bio"}).get_text()
        except:
            description = ""

        [description, readmore] = scraperLibrary.descriptionTrim(
            description, [], 800, artistweb,
            newhtml)  #U Street gets shorter descriptions

        # Create a string with no spaces, so capitals spread over several words
        # form one unbroken run.  (The original also assigned the same value to
        # a never-read lower-case "descriptionjammed"; that dead duplicate is
        # removed.)
        descriptionJammed = description.replace(" ", "")
        # 15+ capitals in a row sends the description through
        # scraperLibrary.killCapAbuse.
        if re.search("[A-Z]{15,}", descriptionJammed):
            description = scraperLibrary.killCapAbuse(description)

        musicurl = ""
        try:
            iframes = bsObj.findAll(
                "iframe"
            )  # If there's a video, grab it and toss it into the "buy music" column.  BUT - skip iframes that don't contain youtubes
            for onei in iframes:

Exemple #7

0

Afficher le fichier

Fichier : dc9scraper.py Projet : DavidSundland/python-web-scrapers

     except:
         artistweb = ""
 try:
     description = bsObj.find("div", {
         "class": "bio"
     }).get_text().strip()
 except:
     description = ""
     print("Found no description")
 # Remove "  / " separators, then trim any slashes and whitespace left at
 # either end of the description.
 description = description.replace(
     "  / ", "").strip("/").strip().strip("/").strip()
 # Remove "Tickets (go) on sale <day>, <date> at <time>" announcements.  Raw
 # string so the regex escapes are not parsed as invalid string-literal
 # escapes.  NOTE: the time part only recognises "<n>am" or "noon".
 description = re.sub(
     r'((Tickets\s)|(TICKETS\s))([gG][oO]\s)*((on[\s\-]sale\s)|(ON[\s\-]SALE\s))[A-Za-z]+\,*\s([0-9\/\-]{3,5}|([a-zA-Z]+)\s[0-9]{1,2})\s(\@|[aA][tT])\s(([0-9]{1,2}[aA][mM])|([nN][oO][oO][nN]))',
     '', description)
 [description, readmore] = scraperLibrary.descriptionTrim(
     description, ["OFFICIAL WEBSITE", "TWITTER", "FACEBOOK"], 800,
     artistweb, newhtml)
 try:
     musicurl = bsObj.find("li", {
         "class": "soundcloud"
     }).find("a").attrs["href"]
 except:
     try:
         musicurl = bsObj.find("li", {
             "class": "bandcamp"
         }).find("a").attrs["href"]
     except:
         try:
             iframes = eventObj.findAll(
                 "iframe"
             )  # If there's a video, grab it and toss it into the "buy music" column.  BUT - skip iframes that don't contain youtubes