def downloadKWorbSpotifyYouTubeArtists(self, update=False):
    """Download the KWorb YouTube artist archive page and collect its table rows.

    When ``update`` is True the archive page is re-downloaded first.  The live
    part of the method only builds ``data`` (one dict of header->td Tag per
    table row) and prints it; everything under ``if False:`` is a disabled
    scratch pipeline kept for reference.
    """
    url = "https://kworb.net/youtube/archive.html"
    savename = "kworb_youtubeartists.p"
    if update is True:
        self.dutils.downloadArtistURL(url=url, savename=savename, force=True)
    bsdata = getHTML(savename)
    data = []
    artistDir = self.disc.getArtistsDir()
    saveDir = setDir(artistDir, "youtube")
    print(artistDir)
    # Collect every table row as {header-text: <td Tag>} (values are Tags, not text).
    for table in bsdata.findAll("table"):
        ths = [th.text for th in table.findAll("th")]
        for tr in table.findAll("tr")[1:]:
            item = dict(zip(ths, tr.findAll("td")))
            data.append(item)
    print(data)
    if False:
        # NOTE(review): disabled scratch code below; nesting reconstructed from a
        # whitespace-mangled source — confirm against version control if revived.
        bsdata = getHTML(savename)
        artistDir = self.disc.getArtistsDir()
        saveDir = setDir(artistDir, "youtube")
        for div in bsdata.findAll("div", {"class": "subcontainer"}):
            if div.find("span", {"class": "pagetitle"}) is None:
                continue
            for ref in div.findAll("a"):
                href = ref.attrs['href']
                url = "{0}/{1}".format(self.youtubeURL, href)
                savename = "{0}/{1}".format(saveDir, href.replace(".html", ".p"))
                if isFile(savename):
                    print("Y\t", savename, '\t', url)
                else:
                    print("-\t", savename, '\t', url)
                    #dbArtistsKWorb().dutils.downloadArtistURL(url=fullURL, savename=savename, force=True)
        for ifile in findExt(saveDir, ".p"):
            bsdata = getHTML(ifile)
            for table in bsdata.findAll("table"):
                trs = table.findAll("tr")
                for tr in trs[1:]:
                    ref = tr.find("a")
                    href = ref.attrs['href']
                    name = ref.text
                    url = "{0}/{1}".format(self.youtubeURL, href)
                    savename = "{0}/{1}".format(
                        setDir(saveDir, "artist"), href.replace(".html", ".p"))
                    print(url, savename)
                    if isFile(savename) is False:
                        data, code = downloadURL(url)
                        from ioUtils import getFile, saveFile
                        saveFile(idata=data, ifile=savename)
                        sleep(3)
                    # presumably stops after the first downloaded row — TODO confirm
                    break
def parseAndDownloadTeamYearlyStandings(self):
    """For every saved season file, extract team IDs/names and download each team's data.

    Each ``.p`` file in the season directory is named by year.  Links carrying a
    ``data-clubhouse-uid`` attribute identify teams: the last path element is the
    team name and the one before it the team ID.  Conflicting names for the same
    ID within a year raise ValueError.
    """
    files = findExt(self.getSeasonDir(), ext=".p", debug=False)
    for ifile in files:
        year = getBaseFilename(ifile)  # filename encodes the season year
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        idVals = {}
        links = bsdata.findAll("a")
        for link in links:
            attrs = link.attrs
            if attrs.get("data-clubhouse-uid") is not None:
                href = attrs['href']
                name = getBasename(href)
                idval = getBasename(getDirname(href))
                # Same ID must always map to the same team name within a year.
                if idVals.get(idval) is not None:
                    if idVals[idval] != name:
                        raise ValueError("Error in ID for this year!")
                idVals[idval] = name
        for idVal, name in idVals.items():
            self.downloadTeamDataByYear(idVal, name, season=str(year), debug=True)
def parseBAFTACategoryData(self, ifile, category, debug=False):
    """Parse a saved BAFTA category HTML file into per-year nominee data.

    Args:
        ifile: path to the saved HTML file.
        category: BAFTA category name; "Best_Direction" selects the
            director-table parser, anything else the film-table parser.
        debug: print the number of wikitable tables found.

    Returns:
        dict mapping year -> {category: [unique entries]}.
    """
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    data = {}
    tables = bsdata.findAll("table", {"class": "wikitable"})
    if debug:
        print(" Found {0} tables".format(len(tables)))
    for table in tables:
        if category == "Best_Direction":
            yeardata = self.parseBAFTADirectorData(table, category, debug=False)
        else:
            yeardata = self.parseBAFTAFilmData(table, category, debug=False)
        data = {**data, **yeardata}
    # Deduplicate entries per year/category.
    # (fix: the inner loop variable previously shadowed the `category` parameter)
    for year, yearData in data.items():
        for cat in yearData.keys():
            data[year][cat] = list(set(data[year][cat]))
    return data
def parseRazziesCategoryData(self, ifile, category, debug=False):
    """Parse a saved Razzies category HTML file into per-year nominee data.

    Tables without a <caption> are treated as acting tables, the rest as film
    tables.

    Args:
        ifile: path to the saved HTML file.
        category: Razzies category name, forwarded to the per-table parsers.
        debug: print the number of wikitable tables found.

    Returns:
        dict mapping year -> {category: [unique entries]}.
    """
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    data = {}
    tables = bsdata.findAll("table", {"class": "wikitable"})
    if debug:
        print(" Found {0} tables".format(len(tables)))
    for table in tables:
        caption = table.find("caption")
        if caption is None:
            yeardata = self.parseRazziesActingData(table, category, debug=False)
        else:
            yeardata = self.parseRazziesFilmData(table, category, debug=False)
        data = {**data, **yeardata}
    # Deduplicate entries per year/category.
    # (fix: the inner loop variable previously shadowed the `category` parameter)
    for year, yearData in data.items():
        for cat in yearData.keys():
            data[year][cat] = list(set(data[year][cat]))
    return data
def parseFilmsiteYearlyData(self, ifile, debug=False):
    """Extract movie titles from a saved Filmsite yearly HTML page.

    Skips the first table and the header row of each remaining table; titles
    come from the <b> tag of the second cell in two-cell rows.  Newlines are
    stripped, runs of spaces collapsed, and a trailing "(...)" suffix removed.

    Returns:
        list of cleaned movie title strings.
    """
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    movies = []
    tables = bsdata.findAll("table")
    tables = tables[1:]  # first table is not movie data
    for table in tables:
        trs = table.findAll("tr")
        trs = trs[1:]  # skip header row
        for tr in trs:
            tds = tr.findAll("td")
            if len(tds) == 2:
                mdata = tds[1].find("b")
                if mdata is not None:
                    movie = mdata.text
                    # Drop LF/CR characters embedded in the title.
                    movie = "".join(
                        [c for c in movie if ord(c) not in [10, 13]])
                    # Collapse repeated spaces to a single space.
                    while movie.find("  ") != -1:
                        movie = movie.replace("  ", " ")
                    # Strip a trailing parenthesised suffix (e.g. a year).
                    pos = movie.rfind("(")
                    if pos != -1:
                        movie = movie[:pos].strip()
                    movies.append(movie)
    return movies
def processFlopsData(self, debug=False):
    """Parse saved Wikipedia 'box office flops' HTML files and save {year: [[movie, 10], ...]}.

    Movie names come from the row's <th>, years from the first <td>; rows start
    at index 2 (header rows skipped).  Every movie gets a fixed score of 10.
    Results are written as JSON via saveFile.
    """
    outdir = self.getDataDir()
    files = findExt(outdir, ext=".html")
    from collections import OrderedDict
    movies = OrderedDict()
    yearlyData = {}
    for ifile in files:
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        tables = bsdata.findAll("table", {"class": "wikitable"})
        for table in tables:
            trs = table.findAll("tr")
            try:
                ths = trs[0].findAll("th")
                ths = [x.text for x in ths]
                ths = [x.replace("\n", "") for x in ths]
            except:
                raise ValueError("Could not get headers")
            print(ths)
            # Data rows begin at index 2 (two header rows).
            for itr, tr in enumerate(trs[2:]):
                ths = tr.findAll("th")  # NOTE: shadows the header list above
                try:
                    movie = ths[0].text
                    movie = movie.replace("\n", "").strip()
                    # Strip a specific Wikipedia footnote marker.
                    movie = movie.replace("[nb 2]", "")
                except:
                    raise ValueError(
                        "Could not find movie in {0}".format(ths))
                tds = tr.findAll("td")
                try:
                    year = tds[0].text
                    year = int(year)
                except:
                    raise ValueError(
                        "Could not find year in {0}".format(tds))
                print(year, '\t', movie)
                if yearlyData.get(year) is None:
                    yearlyData[year] = []
                yearlyData[year].append(movie)
    # Fixed weight of 10 per movie, keyed by ascending year.
    for year in sorted(yearlyData.keys()):
        movies[year] = []
        for movie in yearlyData[year]:
            movies[year].append([movie, 10])
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of flops Data to {1}".format(
        len(movies), savename))
    saveFile(savename, movies)
def parseBoxOfficeMojo(self, ifile, debug=False):
    """Parse a saved Box Office Mojo HTML page into rows of movie values.

    Picks the first table whose first row has >= 10 children, reads column
    keys from the <a> tags of the first row, then collects each subsequent
    row's <td> strings.  Stops at the "TOTAL (N MOVIES)" summary row.

    Args:
        ifile: path to the saved HTML file.
        debug: print a summary count when done.

    Returns:
        list of per-movie value lists (parallel to the extracted keys).

    Raises:
        ValueError: no candidate table found, or a row's value count does not
            match the key count.
    """
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    tbl = None
    for table in bsdata.findAll("table"):
        if tbl:
            break
        # Only the first row of each table is inspected (both branches break).
        for tr in table.findAll("tr"):
            if len(tr) >= 10:
                tbl = table
                break
            else:
                break
    if tbl is None:
        # fix: previously fell through to enumerate(None) -> TypeError
        raise ValueError("No movie table found in {0}".format(ifile))
    keys = []
    data = []
    # Iterating the table Tag walks its direct children in document order.
    for i, tr in enumerate(tbl):
        vals = []
        if i == 0:
            # Header row: keys are the link texts of each cell.
            for j, td in enumerate(tr.findAll("td")):
                for ref in td.findAll("a"):
                    key = ref.string
                    keys.append(key)
        else:
            if len(tr) <= 1:
                continue
            for j, td in enumerate(tr.findAll("td")):
                if td.string is None:
                    continue
                try:
                    # Summary row marks the end of the data.
                    if re.search(r"TOTAL \((\d+) MOVIES\)", td.string):
                        break
                except:
                    print(j, td.string)
                    raise  # fix: was `raise ()` (TypeError: not an exception)
                key = keys[j]
                val = td.string
                vals.append(val)
            if len(vals) == 0:
                break
            if len(vals) != len(keys):
                print("Mismatch with keys/data")
                print(len(keys), '\t', keys)
                print(len(vals), '\t', vals)
                # fix: was `raise ("YO")` (TypeError: str is not an exception)
                raise ValueError("Mismatch with keys/data in {0}".format(ifile))
            else:
                data.append(vals)
    if debug:
        print("Found", len(data), "movies from", ifile)
    return data
def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
    """Parse saved Ultimate Movie Rankings pages into ranked yearly movie lists.

    Processes every ``.p`` file in the data dir (or only the one matching
    ``procYear``).  Rows with exactly 11 cells are movie rows; the film name is
    cell 1 (with any <span> removed and the " (year)" suffix stripped) and the
    rank is the last (or second-to-last) cell parsed as float.  Results sorted
    by descending rank are saved as JSON.
    """
    outdir = self.getDataDir()
    if procYear == None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")
    from collections import OrderedDict
    movieData = OrderedDict()
    for ifile in sorted(files):
        #ifile = "/Users/tgadfort/Documents/code/movies/ultimatemovierankings/data/2017.p"
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        year = getBaseFilename(ifile)  # filename is the year
        data = {}
        done = False
        tables = bsdata.findAll("table")  #, {"id": "table_3"})
        movies = {}
        for it, table in enumerate(tables):
            ths = table.findAll("th")
            trs = table.findAll("tr")
            for itr, tr in enumerate(trs):
                tds = tr.findAll("td")
                # Movie rows have exactly 11 columns.
                if len(tds) == 11:
                    val = removeTag(tds[1], 'span')
                    film = val.text
                    film = film.replace(" ({0})".format(year), "")
                    # Rank is usually the last cell; fall back to the one before it.
                    try:
                        rank = float(tds[-1].text)
                    except:
                        try:
                            rank = float(tds[-2].text)
                        except:
                            raise ValueError(tds[-1], tds[-2], tr)
                    movies[film] = rank
        movieData[year] = movies
    yearlyData = {}
    for year in sorted(movieData.keys()):
        # Highest rank first.
        yearlyData[year] = sorted(movieData[year].items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        print("---->", year,
              " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:5]:
            print(item)
        print('\n')
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
        len(yearlyData), savename))
    saveFile(savename, yearlyData)
def downloadMainArtists(self, force=False, debug=False, sleeptime=2):
    """Walk the saved main page's category sidebar and download each artist page.

    Artist links are sidebar category <li> items; the artist name is the
    second-to-last URL path element and the artist ID is the numeric suffix of
    the item's second CSS class.  Entries whose name is purely numeric or that
    contain "parent-category-ii" are skipped.

    Args:
        force: re-download even when the artist file already exists.
        debug: unused here (kept for interface compatibility).
        sleeptime: unused here (kept for interface compatibility).

    Raises:
        ValueError: the category sidebar or a required link attribute is missing.
    """
    savename = self.getMainSavename()

    ## Parse data
    bsdata = getHTML(savename)
    artistDB = {}

    ## Find and Download Artists
    categories = bsdata.find("div",
                             {"class": "sidebar-widget widget_categories"})
    if categories is None:
        raise ValueError("Cannot find categories!")
    uls = categories.findAll("ul")
    for ul in uls:
        lis = ul.findAll("li")
        for i, li in enumerate(lis):
            try:
                catitem = li.attrs["class"][1]
            except:
                raise ValueError(
                    "Cannot find list class item: {0}".format(li))
            ref = li.find("a")
            if ref is None:
                raise ValueError("Cannot find list link!")
            try:
                href = ref.attrs['href']
            except:
                raise ValueError("Cannot find list href!")

            # check for artist: skip purely-numeric names and category pages
            artistName = href.split('/')[-2]
            try:
                int(artistName)
                continue
            except:
                if artistName.find("parent-category-ii") == -1:
                    pass
                else:
                    continue

            # get artist ID (numeric suffix of the CSS class)
            artistID = catitem.split('-')[-1]
            try:
                int(artistID)
            except:
                continue

            # fix: compute the artist savename BEFORE the exists-check; the
            # old code tested the stale `savename` (main page / previous
            # artist), so the cache check never looked at the right file.
            savename = self.getArtistSavename(artistID)
            if force is False and isFile(savename):
                print("{0} exists.".format(savename))
                continue
            url = href
            print(i, '\t', artistID, '\t', artistName, '\t', savename)
            self.downloadArtistURL(url=url, savename=savename, parse=False)
def parseRottenTomatoesFile(self, ifile, debug=False):
    """Parse a saved Rotten Tomatoes table page into {year: {movie: rating}}.

    The first row supplies column keys; data rows are read positionally:
    column 1 is the percentage rating, column 2 the linked movie title with a
    "(year)" suffix that is split out via regex.
    """
    movies = {}
    if debug:
        print("Parsing {0}".format(ifile))
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    table = bsdata.find("table", {"class": "table"})
    if table:
        keys = []
        for tr in table.findAll("tr"):
            if len(keys) == 0:
                # Header row: fall back to joined <span> text when th.string is None.
                for th in tr.findAll("th"):
                    key = th.string
                    if key == None:
                        key = " ".join(
                            [x.string for x in th.findAll("span")])
                    keys.append(key)
                    #print key
            else:
                line = []
                # Column meanings are fixed by position (0..3).
                for i, td in enumerate(tr.findAll("td")):
                    #print i,'\t',td
                    if i == 0 or i == 3:
                        val = td.string
                    if i == 1:
                        # Rating: first span with a string.
                        for span in td.findAll("span"):
                            if span.string:
                                val = span.string
                                break
                    if i == 2:
                        # Movie title from the link text.
                        ref = td.find("a")
                        #link = ref.attrs["href"]
                        val = ref.string
                    val = val.strip()
                    line.append(val)
                    #print i,'\t',val.strip()
                movie = line[2]
                rating = line[1]
                rating = rating.replace("%", "")
                rating = int(rating)
                # Split the "(year)" suffix out of the title.
                retval = re.search("\((\d+)\)", movie)
                if retval:
                    year = retval.group()
                    movie = movie.replace(year, "").strip()
                    year = retval.groups()[0]
                # NOTE(review): if the regex never matches, `year` here is
                # unbound (first row) or stale (later rows) — confirm inputs.
                #retval = search(r'(%d+)', movie)
                if movies.get(year) == None:
                    movies[year] = {}
                movies[year][movie] = rating
                #print year,'\t',rating,'\t',movie
    return movies
def parseSearchArtist(self, artist, data, maxArtists=99, force=False, debug=False):
    """Parse a search-results page for artist links and download each artist page.

    Collects hrefs from "search-results" lists (artist items, name divs); the
    optional JSON 'data-tooltip' attribute is decoded only for a local ID that
    is not stored.  Up to ``maxArtists`` entries are downloaded, skipping
    existing files unless ``force``.
    """
    if data is None:
        return None

    ## Parse data
    bsdata = getHTML(data)
    artistDB = {}
    uls = bsdata.findAll("ul", {"class": "search-results"})
    for ul in uls:
        lis = ul.findAll("li", {"class": "artist"})
        for li in lis:
            divs = li.findAll("div", {"class": "name"})
            for div in divs:
                link = div.find("a")
                href = link.attrs['href']
                tooltip = link.attrs['data-tooltip']
                # Tooltip is JSON carrying the artist id; decoded value is unused below.
                try:
                    from json import loads
                    tooltip = loads(tooltip)
                    artistID = tooltip['id']
                except:
                    artistID = None
                if artistDB.get(href) is None:
                    artistDB[href] = {"N": 0, "Name": artist}
                artistDB[href]["N"] += 1
    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    iArtist = 0
    for href, hrefData in artistDB.items():
        iArtist += 1
        if iArtist > maxArtists:
            break
        discID = self.dutils.getArtistID(href)
        url = self.getArtistURL(href)
        savename = self.dutils.getArtistSavename(discID)
        print(iArtist, '/', len(artistDB), '\t:', discID, '\t', url)
        # Skip already-downloaded artists unless force is set.
        if isFile(savename):
            if force is False:
                continue
        self.dutils.downloadArtistURL(url, savename, force=force)
def parseSearchArtist(self, artist, data, maxArtists=99, force=False, debug=False):
    """Parse a search page's "section" divs for /artist/ links and download them.

    NOTE(review): the bare ``return`` on the first line deliberately disables
    this method — everything below is currently dead code.  Remove the return
    to re-enable.
    """
    return
    if data is None:
        return None

    ## Parse data
    bsdata = getHTML(data)
    artistDB = {}
    for div in bsdata.findAll("div", {"class": "section"}):
        refs = div.findAll("a")
        for ref in refs:
            # Skip image links; only keep artist-page hrefs.
            if ref.find("img") is not None:
                continue
            href = ref.attrs['href']
            artist = ref.text
            if href.startswith("/artist/") is False:
                continue
            #print(artist,"\t",href)
            if artistDB.get(href) is None:
                artistDB[href] = {"N": 0, "Name": artist}
            artistDB[href]["N"] += 1
    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    iArtist = 0
    for href, hrefData in artistDB.items():
        iArtist += 1
        if iArtist > maxArtists:
            break
        discID = self.dutils.getArtistID(href)
        url = self.getArtistURL(href)
        savename = self.dutils.getArtistSavename(discID)
        print(iArtist, '/', len(artistDB), '\t:', discID, '\t', url)
        #continue
        if isFile(savename):
            if force is False:
                continue
        self.dutils.downloadArtistURL(url, savename, force=force)
def parseFilms101YearlyData(self, ifile, debug=False):
    """Parse a saved films101 yearly page into a list of movie titles.

    Header tables (class "lsthdg") and data tables (class "lstdta") are
    paired by index; each data row is zipped against its header and the
    'TITLE' column extracted.

    Args:
        ifile: path to the saved HTML file.
        debug: print progress information.

    Returns:
        list of movie title strings.

    Raises:
        ValueError: table/header mismatch, row/header length mismatch, or a
            missing TITLE column.
    """
    if debug:
        print(ifile)
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    movies = []
    headertables = bsdata.findAll("table", {"class": "lsthdg"})
    datatables = bsdata.findAll("table", {"class": "lstdta"})
    if len(headertables) < len(datatables):
        print(headertables)
        raise ValueError("Found {0} headers and {1} data tables".format(
            len(headertables), len(datatables)))
    if debug:
        print("Found {0} tables".format(len(datatables)))
    expect = 0  # fix: defined up front so the final debug print cannot NameError
    for i in range(len(datatables)):
        headertable = headertables[i]
        tds = headertable.findAll("td")
        headers = [x.text for x in tds if x is not None]
        headers = [x.strip() for x in headers]
        datatable = datatables[i]
        trs = datatable.findAll("tr")
        expect = len(trs)
        for tr in trs:
            tds = tr.findAll("td")
            tds = [x.text for x in tds if x is not None]
            if len(tds) != len(headers):
                print(headers)
                print(tds)
                # fix: was `1/0` (ZeroDivisionError with no message)
                raise ValueError(
                    "Row has {0} cells but header has {1} columns".format(
                        len(tds), len(headers)))
            try:
                mdata = dict(zip(headers, tds))
            except:
                print(headers)
                print(tds)
                raise ValueError("Could not combine headers and data")
            try:
                movie = mdata['TITLE']
            except:
                raise ValueError(
                    "Could not get movie name from TITLE key! {0}".format(mdata))
            movies.append(movie)
    if debug:
        print("Found {0}/{1} movies".format(len(movies), expect))
    return movies
def getDataBase(self, inputdata):
    """Load artist data from a dict, file path, or pre-parsed object into self.

    Dispatch:
      * dict               -> stored directly on ``self.bsdata``.
      * existing file path -> loaded via fileIO; the loaded object is then
        dispatched by type (artistDBDataClass -> ``self.dbdata``; str/bytes ->
        parsed HTML; BS4/dict -> stored as-is).

    Raises:
        ValueError: unreadable file contents or an unrecognized input type.
    """
    if isinstance(inputdata, dict):
        self.bsdata = inputdata
        return
    elif fileUtil(inputdata).isFile():
        ## Assumes str I believe
        ioData = fileIO().get(inputdata)
        if isinstance(ioData, artistDBDataClass):
            # Already-parsed DB object: store under dbdata, not bsdata.
            self.dbdata = ioData
            return
        elif isinstance(ioData, str):
            try:
                self.bsdata = getHTML(ioData)
            except:
                raise ValueError(
                    "Cannot read artist [str] file: {0}".format(inputdata))
            return
        elif isinstance(ioData, bytes):
            try:
                self.bsdata = getHTML(ioData)
            except:
                raise ValueError(
                    "Cannot read artist [bytes] file: {0}".format(
                        inputdata))
            return
        elif isBS4(ioData):
            self.bsdata = ioData
            return
        elif isinstance(ioData, dict):
            self.bsdata = ioData
            return
        else:
            raise ValueError("Not sure about file data type: {0}".format(
                type(ioData)))
    else:
        raise ValueError("Not sure about input type: {0}".format(
            type(inputdata)))
    return
def downloadUnknownArtistCompositions(self):
    """Download composition ("songs") pages for artists with no saved song file.

    For each metadata entry the /credits URL tail is rewritten to /songs/all,
    the page is fetched, and it is saved only when it contains composer table
    headers; otherwise the artist ID is queued for the master ignore list.
    Throttled with sleep(3) per request; at most 21 attempts per modVal
    (``if i == 20: break``).
    """
    newIgnores = []
    for modVal, modValMetadata in self.metadata.items():
        N = len(modValMetadata)
        ts = timestat(
            "Downloading {0} Unknown Composition Files For ModVal={1}".
            format(N, modVal))
        for i, (artistID,
                artistIDData) in enumerate(modValMetadata.items()):
            savename = self.dutils.getArtistSavename(artistID, song=True)
            href = artistIDData["URL"]
            artist = artistIDData["Name"]
            if isFile(savename):
                continue

            ## Replace /credits with /songs
            href = "/".join(href.split('/')[:-1] + ["songs", "all"])

            ## Create Full URL
            url = urllib.parse.urljoin(self.dbArtists.baseURL, href)
            print("\n")
            print("=" * 100)
            print("{0}/{1}: [{2}] / [{3}]".format(i, N, artist, url))
            data, response = self.dutils.downloadURL(url)
            if response == 200:
                bsdata = getHTML(data)
                # Only keep pages that actually list compositions.
                if len(bsdata.findAll("th", {"class": "title-composer"})) > 0:
                    print(" ---> Saving Data To {0}".format(savename))
                    saveFile(idata=data, ifile=savename)
                    sleep(3)
                    continue
            # No composer data (or non-200): throttle, remember to ignore.
            sleep(3)
            newIgnores.append(artistID)
            if i == 20:
                break
        ts.stop()
    print("New IDs To Ignore")
    print(newIgnores)
    tsUpdate = timestat(
        "Adding {0} ArtistIDs To Master Composition Ignore List".format(
            len(newIgnores)))
    self.updateMasterIgnoreCompositionData(newIgnores)
    tsUpdate.stop()
def getData(self, inputdata):
    """Normalize ``inputdata`` (file path, HTML string, or BS4 object) and parse it.

    Strings naming an existing file are read (retrying with ``version=2``) and
    parsed as HTML; other strings are parsed directly; BS4 objects are used
    as-is.  The result is stored on ``self.bsdata`` and ``self.parse()`` is
    returned.

    Raises:
        ValueError: unreadable file or unrecognized input type.
    """
    if isinstance(inputdata, str):
        if isFile(inputdata):
            try:
                bsdata = getHTML(getFile(inputdata))
            except:
                # Retry with the alternate file-format reader.
                try:
                    bsdata = getHTML(getFile(inputdata, version=2))
                except:
                    raise ValueError(
                        "Cannot read artist file: {0}".format(inputdata))
        else:
            try:
                bsdata = getHTML(inputdata)
            except:
                raise ValueError(
                    "Not sure about string input: {0} . It is not a file".
                    format(inputdata))
    elif isBS4(inputdata):
        bsdata = inputdata
        pass
    else:
        raise ValueError("Not sure about input type: {0}".format(
            type(inputdata)))
    self.bsdata = bsdata
    return self.parse()
def parseSearchArtist(self, artist, data, maxArtists=99, force=False, debug=False):
    """Parse table-based search results for artist links and download each page.

    The first <a> of each row's first cell supplies the artist name/href.
    Downloads are capped both by ``maxArtists`` (rows considered) and by an
    internal limit of ~20 actual downloads (``iDown``).
    """
    if data is None:
        return None

    ## Parse data
    bsdata = getHTML(data)
    artistDB = {}
    tables = bsdata.findAll("table")
    for table in tables:
        ths = table.findAll("th")
        headers = [x.text for x in ths]
        trs = table.findAll("tr")
        for tr in trs[1:]:  # skip header row
            tds = tr.findAll("td")
            name = tds[0].find('a').text
            href = tds[0].find('a').attrs['href']
            if artistDB.get(href) is None:
                artistDB[href] = {"N": 0, "Name": name}
            artistDB[href]["N"] += 1
    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    iArtist = 0
    iDown = 0  # count of actual downloads this call
    for href, hrefData in artistDB.items():
        if iDown > 20:
            break
        iArtist += 1
        if iArtist > maxArtists:
            break
        discID = self.dutils.getArtistID(href)
        url = self.getArtistURL(href)
        savename = self.dutils.getArtistSavename(discID)
        print(iArtist, '/', len(artistDB), '\t:', discID, '\t', url)
        if isFile(savename):
            if force is False:
                continue
        iDown += 1
        self.dutils.downloadArtistURL(url, savename, force=force)
def parseSearchArtist(self, artist, data):
    """Parse song-article search results, derive artist URLs, and download each.

    Songs are <article> tags of class bgl0/bgl1; the artist name comes from
    the <label> and the artist URL from the first two path elements of the
    song link.  The disc ID is derived from the artist *name* (not the URL).
    """
    if data is None:
        return None

    ## Parse data
    bsdata = getHTML(data)
    artistDB = {}
    songs = bsdata.findAll("article", {"class": "bgl0"}) + bsdata.findAll(
        "article", {"class": "bgl1"})
    for i, song in enumerate(songs):
        label = song.find("label")
        if label is None:
            continue
        name = label.text
        ref = song.find("a").attrs['href']
        # Artist page is the first two path components of the song link.
        artistURL = "/".join(ref.split("/")[:2])
        #print(name,'\t',url,'\t',artistID)
        if artistDB.get(artistURL) is None:
            artistDB[artistURL] = {"N": 0, "Name": name}
        artistDB[artistURL]["N"] += 1
    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    iArtist = 0
    for href, hrefData in artistDB.items():
        iArtist += 1
        name = hrefData["Name"]
        discID = self.dutils.getArtistID(name)  # keyed on name, not href
        url = self.getArtistURL(href)
        savename = self.getArtistSavename(discID)
        print(iArtist, '/', len(artistDB), '\t:', discID, '\t', url)
        if isFile(savename):
            continue
        self.dutils.downloadArtistURL(url, savename)
def downloadKWorbSpotifyArtists(self, update=False):
    """Download the KWorb Spotify artist list and fetch each artist's page.

    When ``update`` is True the index page is re-downloaded first.  Table rows
    are collected as {header: <td Tag>}; each artist's link is resolved against
    ``self.spotifyURL`` and downloaded unless its save file already exists.
    """
    url = "https://kworb.net/spotify/artists.html"
    savename = "kworb_spotifyartists.p"
    if update is True:
        self.dutils.downloadArtistURL(url=url, savename=savename, force=True)
    bsdata = getHTML(savename)
    data = []
    artistDir = self.disc.getArtistsDir()
    saveDir = setDir(artistDir, "data")
    print(artistDir)
    # Collect rows as {header-text: <td Tag>}; values stay Tags for later .find().
    for table in bsdata.findAll("table"):
        ths = [th.text for th in table.findAll("th")]
        for tr in table.findAll("tr")[1:]:
            item = dict(zip(ths, tr.findAll("td")))
            data.append(item)
    print("Found {0} Spotify Artists".format(len(data)))
    for i, item in enumerate(data):
        info = item["Artist"]
        url = info.find('a').attrs['href']
        name = info.find('a').text
        savename = setFile(saveDir, "{0}.p".format(getBaseFilename(url)))
        if isFile(savename):
            continue
            # NOTE(review): unreachable — the continue above skips this print.
            print("Y\t", savename, '\t', url, '\t', name)
        else:
            fullURL = "{0}/{1}".format(self.spotifyURL, url)
            print("{0}/{1}".format(i, len(data)), "\t-\t", savename, '\t',
                  fullURL, '\t', name)
            try:
                self.dutils.downloadArtistURL(url=fullURL,
                                              savename=savename,
                                              force=True)
            except:
                print(" ---> Error")
            # Throttle between downloads.  NOTE(review): placement after the
            # try/except reconstructed from mangled source — confirm.
            sleep(1)
def parseSearchArtist(self, artist, data):
    """Parse listing search results, rebuild artist URLs, and download each page.

    Listing links have their last four path elements stripped and "/artist"
    appended to form the artist page URL; an ID is derived from that URL.
    Existing save files are skipped.
    """
    if data is None:
        return None

    ## Parse data
    bsdata = getHTML(data)
    artistDB = {}
    descrs = bsdata.findAll("div", {"class": "listingDescription"})
    for descr in descrs:
        refs = descr.findAll("a", {"class": "listingTitle"})
        for ref in refs:
            url = ref.attrs['href']
            # Drop the listing-specific tail and point at the artist page.
            fullurl = "/".join(url.split("/")[:-4])
            fullurl = "{0}/artist".format(fullurl)
            artistID = self.dutils.getArtistID(fullurl)
            if artistDB.get(fullurl) is None:
                artistDB[fullurl] = {"N": 0, "ID": artistID}
            artistDB[fullurl]["N"] += 1
    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    iArtist = 0
    for href, hrefData in artistDB.items():
        iArtist += 1
        discID = hrefData["ID"]
        url = href
        savename = self.getArtistSavename(discID)
        print(iArtist, '/', len(artistDB), '\t:', discID, '\t', url, '\t',
              savename)
        if isFile(savename):
            continue
        self.downloadArtistURL(url, savename)
def processWikipediaYearlyData(self, procYear=None, debug=False):
    """Parse saved Wikipedia Oscar pages per year and save one JSON per year.

    Tries the standard parser first and falls back to the "special" parser
    when it yields nothing.  Each category's winner (and nominees when
    ``debug``) is printed before the results are saved.
    """
    outdir = self.getDataDir()
    if procYear == None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")
    from collections import OrderedDict
    movies = OrderedDict()  # NOTE(review): never used below
    for ifile in files:
        if debug:
            print("Processing {0}".format(ifile))
        year = getBaseFilename(ifile)
        #if year == "1985": continue
        # NOTE(review): htmldata/bsdata are unused — the parse methods below
        # re-read the file themselves.
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        results = self.parseWikipediaOscarData(ifile, debug=False)
        if len(results) == 0:
            # Fall back to the special-format parser.
            results = self.parseWikipediaOscarDataSpecial(ifile, debug=debug)
        if len(results) == 0:
            raise ValueError("No results for {0}".format(ifile))
        for k, v in results.items():
            print("====>", year, '\t', k)
            print(" Winner :", results[k]["Winner"])
            if debug:
                print(" Nominees:", results[k]["Nominees"])
            print("")
        savename = setFile(self.getResultsDir(), "{0}.json".format(year))
        print("Saving {0} wikipedia oscar data to {1}".format(
            year, savename))
        saveFile(savename, results)
def processRollingStoneData(self, debug=False):
    """Parse saved Rolling Stone list pages and save {year: [[movie, 10], ...]} as JSON.

    Each <h3 class="c-list__title t-bold"> holds a title of the form
    '"Movie" (YYYY)': the year is sliced from the tail and the movie name
    from the interior.  Every movie gets a fixed weight of 10.
    """
    from collections import OrderedDict
    dataDir = self.getDataDir()
    htmlFiles = findExt(dataDir, ext=".html")
    movies = OrderedDict()
    perYear = {}
    for htmlFile in htmlFiles:
        pageData = getFile(htmlFile)
        soup = getHTML(pageData)
        titleTags = soup.findAll("h3", {"class": "c-list__title t-bold"})
        cleaned = [
            tag.text.replace("\n", "").strip() for tag in titleTags
        ]
        for entry in cleaned:
            # Year is the 4 digits just before the closing paren.
            try:
                year = int(entry[-5:-1])
            except:
                raise ValueError("Could not get year from {0}".format(entry))
            movie = entry[1:-8]  # strip quotes and the " (YYYY)" tail
            print(year, '\t', movie)
            perYear.setdefault(year, []).append(movie)
    for year in sorted(perYear):
        movies[year] = [[movie, 10] for movie in perYear[year]]
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of rollingstone Data to {1}".format(
        len(movies), savename))
    saveFile(savename, movies)
def parseSearchArtist(self, artist, data):
    """Extract artist/album entries from search results and save them to the
    next free page file for this artist.

    Each "contentItem" div contributes one record with the artist name, album
    title, and (when present) the album link href.  The save path is the first
    ``getArtistSavename(artistID, page)`` that does not already exist.
    """
    if data is None:
        return None

    ## Parse data
    soup = getHTML(data)
    records = []
    for item in soup.findAll("div", {"class": "contentItem"}):
        nameDiv = item.find("div", {"class": "artist"})
        titleDiv = item.find("div", {"class": "title"})
        if nameDiv is None or titleDiv is None:
            continue
        try:
            link = titleDiv.find("a").attrs['href']
        except:
            link = None
        records.append({
            "ArtistName": nameDiv.text,
            "AlbumName": titleDiv.text,
            "AlbumURL": link,
        })

    artistID = self.dutils.getArtistID(artist)
    # Find the first unused page number for this artist.
    page = 1
    savename = self.getArtistSavename(artistID, page)
    while isFile(savename):
        page += 1
        savename = self.getArtistSavename(artistID, page)
    print("Saving {0} new artist media to {1}".format(len(records), savename))
    saveFile(idata=records, ifile=savename)
def parseSearchArtist(self, artist, data, maxArtists=99, force=False, debug=False):
    """Parse Deezer search results (embedded JSON app state) and download artists.

    The page embeds ``window.__DZR_APP_STATE__ = {...}`` in a <script>; both
    the "TOP_RESULT" and "ARTIST" sections contribute ART_ID/ART_NAME pairs.
    Up to ``maxArtists`` artist pages are downloaded, skipping existing files
    unless ``force``.
    """
    if data is None:
        print("Data is None!")
        return None

    ## Parse data
    bsdata = getHTML(data)

    # Locate and decode the embedded app-state JSON.
    jdata = None
    for script in bsdata.findAll("script"):
        if len(script.contents) == 0:
            continue
        if script.contents[0].startswith("window.__DZR_APP_STATE__ = "):
            try:
                jdata = json.loads(script.contents[0].replace(
                    "window.__DZR_APP_STATE__ = ", ""))
            except:
                continue
        if jdata is not None:
            break
    if jdata is None:
        print("Could not find JSON data in search results")
        return

    artistDB = {}
    for result in jdata["TOP_RESULT"]:
        try:
            artistID = result["ART_ID"]
            artistName = result["ART_NAME"]
        except:
            print("No data in {0}".format(result))
            continue
        url = self.getArtistURL(artistID)
        if artistDB.get(url) is None:
            artistDB[url] = {"N": 0, "Name": artistName, "ID": artistID}
        artistDB[url]["N"] += 1
        if self.debug:
            print("[{0}] , [{1}]".format(artistID, artistName))
    for result in jdata["ARTIST"]['data']:
        try:
            artistID = result["ART_ID"]
            artistName = result["ART_NAME"]
        except:
            print("No data in {0}".format(result))
            continue
        url = self.getArtistURL(artistID)
        if artistDB.get(url) is None:
            artistDB[url] = {"N": 0, "Name": artistName, "ID": artistID}
        artistDB[url]["N"] += 1
        if self.debug:
            print("[{0}] , [{1}]".format(artistID, artistName))
    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    iArtist = 0
    for url, hrefData in artistDB.items():
        iArtist += 1
        if iArtist > maxArtists:
            break
        artistID = hrefData["ID"]
        savename = self.dutils.getArtistSavename(artistID)
        name = hrefData['Name']
        print(iArtist, '/', len(artistDB), '\t:', artistID, '\t', name,
              '\t', savename)
        if isFile(savename):
            if force is False:
                continue
        self.dutils.downloadArtistURL(url, savename, force=force)
def parseTeamYearlyStandings(self, startYear=2003, endYear=2018, debug=False, verydebug=False):
    """Parse saved per-team season schedule pages into season/team/game objects.

    For each year's saved files: the team ID/year come from the page's
    og:url meta tag, schedule tables (class "Table2__table") supply game rows,
    and each row yields date, opponent, location, and a W/L/T score that is
    turned into a ``game`` object and attached to the ``team``.  Teams with no
    games have their source file removed; each season is saved as a pickle.

    NOTE(review): ``gameID`` is referenced when constructing ``game`` but is
    never assigned in this method — likely NameError when a game row is fully
    parsed; confirm against the original module.  The error messages formatting
    ``game`` inside except blocks also reference the (module-level?) ``game``
    callable, not a local row.
    """
    for year in range(startYear, endYear + 1):
        seasonDir = self.getYearlySeasonDir(year)
        files = findExt(seasonDir, ext=".p", debug=False)
        seasonData = season(year)
        for ifile in files:
            nameyear = getBaseFilename(ifile)
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            teamName = nameyear.replace("-{0}".format(year), "")
            metadata = bsdata.find("meta", {"property": "og:url"})
            if metadata is None:
                raise ValueError(
                    "Could not find basic team meta data for this file! {0}"
                    .format(ifile))
            try:
                content = metadata.attrs['content']
                # NOTE(review): this rebinds the loop variable `year` to a
                # string taken from the URL.
                year = getBasename(content)
                teamID = getBasename(getDirname(getDirname(content)))
            except:
                raise ValueError(
                    "Could not get team year and ID from meta data: {0}".
                    format(metadata))
            if verydebug:
                print(year, '\t', teamID, '\t', ifile)

            ## Create Team Object
            teamData = team(year=year, teamName=teamName, teamMascot=None,
                            teamID=teamID)
            tables = bsdata.findAll("table", {"class": "Table2__table"})
            if verydebug:
                print("\tFound {0} game tables".format(len(tables)))
            for it, table in enumerate(tables):
                trs = table.findAll("tr")
                headers = trs[1]  # row 1 holds the column headers
                headers = [
                    x.text for x in headers.findAll("td") if x is not None
                ]
                gameRows = trs[2:]
                totalGames = len(gameRows)
                if verydebug:
                    print("\tFound {0} potential games".format(totalGames))
                for ig, tr in enumerate(gameRows):
                    tds = tr.findAll("td")
                    gameData = dict(zip(headers, tds))
                    extra = {"OT": False, "Bowl": False}

                    ## Get the Date
                    try:
                        date = gameData["Date"]
                    except:
                        print(ifile)
                        raise ValueError(
                            "No date for this game! {0}".format(gameData))
                    date = date.text

                    ## Only Keep Games With Regular Dates
                    try:
                        dateval = "{0} {1}".format(
                            date.split(", ")[-1], year)
                        date = getDateTime(dateval)
                    except:
                        date = None
                    if date is None:
                        continue

                    ## Check for January Games (in the following year)
                    if date.month == 1:
                        date = addMonths(date, 12)

                    ## Get the Opponent
                    try:
                        opponent = gameData["Opponent"]
                    except:
                        raise ValueError(
                            "No opponent for this game! {0}".format(game))
                    try:
                        oppolink = opponent.find("a")
                        oppohref = oppolink.attrs['href']
                        opponame = getBasename(oppohref)
                        oppoID = getBasename(getDirname(oppohref))
                    except:
                        # Unlinked opponents get a sentinel ID of 0.
                        opponame = opponent.text
                        oppoID = 0
                        #raise ValueError("Could not find href in link! {0}".format(opponent))
                    try:
                        gamespan = opponent.find("span", {"class": "pr2"})
                        gametype = gamespan.text
                    except:
                        raise ValueError(
                            "Could not find game type from {0}".format(
                                opponent))
                    # "vs" = home game, "@" = away game.
                    if gametype == "vs":
                        location = teamID
                    elif gametype == "@":
                        location = oppoID
                    else:
                        raise ValueError(
                            "Location --> {0}".format(gametype))
                    if verydebug:
                        print("\t{0}/{1}\t{2}\t{3: <4}{4: <50}".format(
                            ig, totalGames, printDateTime(date), gametype,
                            opponame), end="\t")

                    ## Get the Result
                    try:
                        result = gameData["Result"]
                    except:
                        raise ValueError(
                            "No result for this game! {0}".format(game))
                    spans = result.findAll("span")
                    if len(spans) == 0:
                        continue
                    if len(spans) != 2:
                        raise ValueError(
                            "There are {0} spans in this row!: {1}".format(
                                len(spans), result))
                    outcome = spans[0].text.strip()
                    score = spans[1].text.strip()
                    if score.endswith("OT"):
                        extra = {"OT": True}
                        score = score[:-3].strip()
                    try:
                        scores = [int(x) for x in score.split('-')]
                    except:
                        raise ValueError(
                            "Could not create integer scores from {0}".
                            format(spans))
                    # Scores are listed winner-first; map onto team/opponent.
                    if outcome == 'W':
                        teamScore = scores[0]
                        oppoScore = scores[1]
                        teamResult = "W"
                        oppoResult = "L"
                    elif outcome == "L":
                        teamScore = scores[1]
                        oppoScore = scores[0]
                        teamResult = "L"
                        oppoResult = "W"
                    elif outcome == "T":
                        teamScore = scores[0]
                        oppoScore = scores[1]
                        teamResult = "T"
                        oppoResult = "T"
                    else:
                        raise ValueError(
                            "Did not recognize game outcome {0}".format(
                                outcome))

                    ## Get the Game
                    try:
                        gamelink = result.find("a")
                        gamehref = gamelink.attrs['href']
                    except:
                        raise ValueError(
                            "Could not find href in link! {0}".format(
                                result))
                    if verydebug:
                        print("{0} {1}".format(
                            teamResult, "-".join(
                                str(x) for x in [teamScore, oppoScore])))

                    ## Create game object
                    # NOTE(review): `gameID` is not defined anywhere above.
                    gameData = game(gameID=gameID, date=date, teamA=teamID,
                                    teamB=oppoID, teamAResult=teamResult,
                                    teamBResult=oppoResult,
                                    teamAScore=teamScore,
                                    teamBScore=oppoScore,
                                    location=location)

                    ## Append game to team data
                    teamData.addGame(gameData)

            ## Show Summary
            teamData.setStatistics()
            if debug:
                teamData.summary()
            # Remove source files that produced no games.
            if teamData.ngames == 0:
                removeFile(ifile, debug=True)
            seasonData.addTeam(teamData)

        #http://www.espn.com/college-football/team/schedule/_/id/201/season/2005"
        savename = setFile(self.getSeasonResultsDir(), "{0}.p".format(year))
        saveFile(idata=seasonData, ifile=savename, debug=True)
def parseWikipediaOscarDataSpecial(self, ifile, debug=True):
    """Fallback parser for Wikipedia Oscar pages where categories are in <th> tags.

    Category titles come from the table's <th> links and nominee lists from
    the <ul> inside each cell; titles and lists are paired by position, which
    must match in count (otherwise returns None).  The first list entry is the
    winner, the rest are nominees.

    Returns:
        dict {category: {"Winner": ..., "Nominees": [...]}}, or None when the
        title/data counts disagree.
    """
    print("HI: {0}".format(ifile))
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    data = {}
    done = False
    tables = bsdata.findAll("table", {"class": "wikitable"})
    if debug:
        print(" Found {0} tables".format(len(tables)))
    for table in tables:
        if done:
            if debug:
                print(" Breaking...")
            break

        ## Get <th> data
        ths = table.findAll("th")
        thData = [x.find('a') for x in ths]
        titles = [x.string for x in thData if x is not None]
        if len(titles) == 0:
            continue

        ## Get <tr> data
        trData = []
        trs = table.findAll("tr")
        for tr in trs:
            tds = tr.findAll("td")
            for td in tds:
                ul = td.find("ul")
                if ul is not None:
                    trData.append(ul)
        print(len(titles))
        print(len(trData))
        # Titles and nominee lists are paired positionally; counts must match.
        if len(titles) != len(trData):
            print("Can not process this data!")
            print("Titles: {0}: {1}".format(len(titles), titles))
            print("Data: {0}".format(len(trData)))
            return None

        ## Merge titles and data
        for title, titleData in zip(titles, trData):
            results = []
            lis = titleData.findAll("li")
            if debug:
                print(" Found {0} entries".format(len(lis)))
            for k, li in enumerate(lis):
                text = []
                # First entry (the winner) is bolded; later entries are plain links.
                if k == 0:
                    for lival in li.findAll("b"):
                        for ref in lival.findAll("a"):
                            text.append(ref.string)
                else:
                    for ref in li.findAll("a"):
                        text.append(ref.string)
                if len(text) == 0:
                    continue
                # Collapse extra links into "name, name, ..." after the first.
                if len(text) > 2:
                    text = [text[0], ", ".join(text[1:])]
                text = self.reorderWikipediaOscarData(text, title)
                results.append(text)
            # Unwrap single-element lists.
            for k, result in enumerate(results):
                if isinstance(result, list):
                    if len(result) == 1:
                        results[k] = result[0]
            data[title] = {}
            data[title]["Winner"] = results[0]
            data[title]["Nominees"] = results[1:]
            if debug:
                print(" Winner :", data[title]["Winner"])
                print(" Nominees:", data[title]["Nominees"])
                print("")
    return data
def parseWikipediaOscarData(self, ifile, debug=False):
    """Parse a Wikipedia Oscar page where each category sits in a <td> cell.

    Category titles come from the <div><a> in each cell and nominee lists from
    the cell's <ul>; the bolded first entry is the winner.  Encountering a
    title whose entry is already populated sets ``done`` and stops the scan
    (tables repeat beyond that point).

    Returns:
        dict {category: {"Winner": ..., "Nominees": [...]}}.
    """
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    data = {}
    done = False
    tables = bsdata.findAll("table", {"class": "wikitable"})
    if debug:
        print(" Found {0} tables".format(len(tables)))
    for table in tables:
        if done:
            if debug:
                print(" Breaking...")
            break
        trs = table.findAll("tr")
        if debug:
            print(" Found {0} rows".format(len(trs)))
        for i, tr in enumerate(trs):
            if done:
                if debug:
                    print(" Breaking...")
                break
            tds = tr.findAll("td")
            if debug:
                print(" Found {0} cols".format(len(tds)))
            for j, td in enumerate(tds):
                div = td.find("div")
                if div == None:
                    continue
                    # NOTE(review): `raise ()` reconstructed as dead code after
                    # the continue (it is a TypeError if ever executed) —
                    # confirm placement against the original module.
                    raise ()
                ref = div.find("a")
                title = ref.string
                data[title] = {}
                if debug:
                    print(" Found {0}".format(title))
                # An already-filled title means the tables have started
                # repeating; stop.  (The fresh `data[title] = {}` above is
                # falsy, so only previously *populated* titles trigger this.)
                if data.get(title):
                    done = True
                    if debug:
                        print(" Already know about {0}".format(title))
                        print(" Breaking...")
                    break
                results = []
                ul = td.find("ul")
                lis = ul.findAll("li")
                #if debug:
                #    print("      Found {0} entries".format(len(lis)))
                for k, li in enumerate(lis):
                    text = []
                    # First entry (the winner) is bolded; later ones are plain links.
                    if k == 0:
                        for lival in li.findAll("b"):
                            for ref in lival.findAll("a"):
                                text.append(ref.string)
                    else:
                        for ref in li.findAll("a"):
                            text.append(ref.string)
                    if len(text) == 0:
                        continue
                    if len(text) > 2:
                        text = [text[0], ", ".join(text[1:])]
                    text = self.reorderWikipediaOscarData(text, title)
                    results.append(text)
                if debug:
                    print("Summary\n {0}: {1}".format(title, results))
                # Unwrap single-element lists.
                for k, result in enumerate(results):
                    if isinstance(result, list):
                        if len(result) == 1:
                            results[k] = result[0]
                data[title]["Winner"] = results[0]
                data[title]["Nominees"] = results[1:]
                if debug:
                    print(" Winner :", data[title]["Winner"])
                    print(" Nominees:", data[title]["Nominees"])
                    print("")
    return data
def parseWikiFilmYearlyData(self, ifile, debug=False):
    """
    Parse a Wikipedia yearly film-ranking wikitable into {title: rank}.

    Only the first wikitable that yields rows is used.  A row's rank may
    come from its <th> (row header) or its first <td>; a blank rank
    inherits the previous row's rank (ties).

    Args:
        ifile: path to the saved HTML file
        debug: print a summary of how many movies were found

    Returns:
        dict: {movie title: integer rank}

    Raises:
        ValueError: when the header row is missing/incomplete or a rank
            cannot be converted to an integer.
    """
    # FIX: the dead dict(zip(...)) fallback branch contained
    # "Values: {1}".format(tds), which would raise IndexError if ever
    # reached; the branch was unreachable (zip over lists never raises)
    # and has been removed.  Bare `except:` clauses narrowed; unused
    # `done`/`maxCol`/`itr` removed.
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    data = {}
    tables = bsdata.findAll("table", {"class": "wikitable"})
    if debug:
        print("  Found {0} tables".format(len(tables)))
    for table in tables:
        if len(data) > 0:
            break  # only the first table that produced rows is used
        trs = table.findAll("tr")
        headerRow = trs[0]
        ths = headerRow.findAll("th")
        if len(ths) == 0:
            raise ValueError("There are no headers in the first row!")
        ths = [x.text.replace("\n", "") for x in ths]
        # Both required columns must be present in the header
        if "Rank" not in ths or "Title" not in ths:
            raise ValueError("No Rank/Title in header: {0}".format(ths))
        pRank = None
        for tr in trs[1:]:
            tds = tr.findAll("td")
            th = tr.find("th")
            if th is not None:
                # Rank lives in the row header cell
                rank = th.text.replace(".", "").replace("\n", "").strip()
                if len(rank) == 0:
                    rank = pRank  # blank rank -> tie with previous row
                try:
                    rank = int(rank)
                except (TypeError, ValueError):
                    raise ValueError(
                        "Cannot create integer rank from {0}".format(rank))
                tds = [x.text.replace("\n", "").strip() for x in tds]
                tds.insert(0, rank)
            else:
                # Rank is the first data cell
                tds = [x.text.replace("\n", "").strip() for x in tds]
                tds[0] = tds[0].replace(".", "")
                if len(tds[0]) == 0:
                    tds[0] = pRank
            row = dict(zip(ths, tds))
            try:
                int(row['Rank'])
            except (KeyError, TypeError, ValueError):
                raise ValueError(
                    "There is no Ranking header in this row: {0}".format(row))
            data[row['Title']] = int(row['Rank'])
            pRank = data[row['Title']]
    if debug:
        print("  Found {0} movies".format(len(data)))
    return data
def parseSearchArtist(self, artist, data, maxArtists=99, force=False, debug=False):
    """
    Parse a search-results page for artist links and download each artist
    page (up to maxArtists) that is not already saved on disk.

    Args:
        artist: search term (kept for interface compatibility; stored
            names are taken from the page itself)
        data: raw HTML of the search-results page, or None
        maxArtists: stop after this many artist links
        force: re-download even when the file already exists
        debug: verbose progress output

    Returns:
        None
    """
    # FIX: the loop no longer clobbers the `artist` parameter; the bare
    # `except:` is narrowed to AttributeError; a missing href (attrs.get
    # returning None) no longer crashes on .endswith(); unused enumerate
    # counters removed.
    if data is None:
        return None

    ## Parse data: collect candidate artist hrefs with occurrence counts
    bsdata = getHTML(data)
    artistDB = {}
    for h4 in bsdata.findAll("h4"):
        spans = h4.findAll("span")
        ref = h4.find("a") if len(spans) == 0 else spans[0].find("a")
        if ref is None:
            continue
        try:
            href = ref.attrs.get('href')
            artistName = ref.text.strip()
        except AttributeError:
            print("Could not get artist/href from {0}".format(ref))
            continue
        if href is None or href.endswith("?anv="):
            continue
        if artistDB.get(href) is None:
            artistDB[href] = {"N": 0, "Name": artistName}
        artistDB[href]["N"] += 1

    # NOTE(review): class-level self.debug gates this summary while the
    # `debug` parameter gates the rest -- kept as in the original.
    if self.debug:
        print("Found {0} artists".format(len(artistDB)))

    iArtist = 0
    for href, hrefData in artistDB.items():
        iArtist += 1
        if iArtist > maxArtists:
            break
        if href.startswith("/artist") is False:
            if debug:
                print("href [{0}] does not start with /artist".format(href))
            continue
        discID = self.dutils.getArtistID(href)
        url = self.getArtistURL(href)
        savename = self.dutils.getArtistSavename(discID)
        print(iArtist, '/', len(artistDB), '\t:', len(discID), '\t', url)
        if isFile(savename) and force is False:
            if debug:
                print("--> File exists.")
            continue
        if debug:
            print("Downloading {0} to {1} (Force={2})".format(url, savename, force))
        retval = self.dutils.downloadArtistURL(url, savename, force=force, debug=True)
        #retval = self.dutils.downloadArtistURL(url, savename, force=force, sleeptime=self.sleeptime,
        if debug:
            print("Finished Downloading: Result is {0}".format(retval))
def parseGameData(self, startYear=2003, endYear=2018, debug=False, verydebug=False):
    """
    Parse saved ESPN game pages (one pickled HTML file per game) into
    per-year drive/play summaries.

    For each year in [startYear, endYear], reads every ".p" file in the
    year's games directory, extracts the two teams and the drive-by-drive
    play data, and saves {gameID: gameData} to
    "<results dir>/<year>-games.p".

    Args:
        startYear: first season to process (inclusive)
        endYear: last season to process (inclusive)
        debug: currently unused inside the body
        verydebug: print per-game/per-drive diagnostics

    Returns:
        dict: {year: [gameIDs for which no possession data was found]}

    Raises:
        ValueError: when a drive has no recognized headline, no drive
            list, or a play is missing its position/text spans.
    """
    noData = {}
    for year in range(startYear, endYear + 1):
        yearData = {}
        gamesDir = self.getYearlyGamesDir(year)
        files = findExt(gamesDir, ext=".p", debug=False)
        noData[year] = []
        for i, ifile in enumerate(files):
            gameID = getBaseFilename(ifile)
            if gameID in self.noGameData:
                continue  # known-bad game, skip early
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            #print(bsdata)
            #verydebug=True
            #if gameID not in ['400603866']:
            #    continue

            ## Team containers -- assumes index 0 is the away team and
            ## index 1 the home team (TODO confirm against page layout)
            teamData = bsdata.findAll("div", {"class": "team-container"})
            longNames = [
                x.find("span", {"class": "long-name"}) for x in teamData
            ]
            longNames = [x.text for x in longNames if x is not None]
            shortNames = [
                x.find("span", {"class": "short-name"}) for x in teamData
            ]
            shortNames = [x.text for x in shortNames if x is not None]
            teamAbbrevs = [
                x.find("span", {"class": "abbrev"}) for x in teamData
            ]
            teamNames = [x.attrs for x in teamAbbrevs if x is not None]
            teamNames = [x['title'] for x in teamNames]
            teamAbbrevs = [x.text for x in teamAbbrevs]
            # Team ID is the numeric part of the team-logo image filename
            teamIDs = [
                x.find("img", {"class": "team-logo"}) for x in teamData
            ]
            teamIDs = [x.attrs for x in teamIDs if x is not None]
            teamIDs = [x['src'] for x in teamIDs]
            teamIDs = [re.search(r"(\d+).png", x) for x in teamIDs]
            teamIDs = [x.groups()[0] for x in teamIDs]
            awayTeam = {
                "Name": longNames[0],
                "Mascot": shortNames[0],
                "Abbrev": teamAbbrevs[0],
                "ID": teamIDs[0]
            }
            homeTeam = {
                "Name": longNames[1],
                "Mascot": shortNames[1],
                "Abbrev": teamAbbrevs[1],
                "ID": teamIDs[1]
            }

            # Page title (e.g. matchup description) from the og:title meta
            metadata = bsdata.find("meta", {"property": "og:title"})
            title = None
            if metadata is not None:
                title = metadata.attrs['content']
            if verydebug:
                print("==> {0}".format(title))

            ## Possessions -- newer pages use a css-accordion <ul>, older
            ## ones a play-by-play <article>; games with neither are
            ## recorded in noData and skipped
            posData = bsdata.find("ul", {"class": "css-accordion"})
            if posData is None:
                posData = bsdata.find("article", {"class": "play-by-play"})
                if posData is None:
                    noData[year].append(gameID)
                    if verydebug:
                        print("Could not find possession data! {0}".format(
                            gameID))
                    continue
                    #print(bsdata)
                    #1/0
                    #removeFile(ifile, debug)
                    #continue

            gameData = {
                "Teams": {
                    "Away": awayTeam,
                    "Home": homeTeam
                },
                "Plays": []
            }
            # Lightweight progress line every 10 games
            if i % 10 == 0:
                print("{0}/{1} with {2} no data games".format(
                    i, len(files), len(noData[year])))

            ###################
            ## Get Full Drive Data
            ###################
            drives = posData.findAll("li", {"class": "accordion-item"})
            if verydebug:
                print("Drives {0}".format(len(drives)))
            for idr, drive in enumerate(drives):
                ## Get Drive Summary
                headlines = [
                    x.text.strip()
                    for x in drive.findAll("span", {"class": "headline"})
                ]
                if verydebug:
                    print("Headlines {0}".format(len(headlines)))
                ## Get Drive Details
                details = [
                    x.text.strip() for x in drive.findAll(
                        "span", {"class": "drive-details"})
                ]
                if verydebug:
                    print("Details {0}".format(len(details)))
                ## Get Home Score
                homescores = drive.findAll("span", {"class": "home"})
                homescores = [
                    x.find("span", {"class": "team-score"})
                    for x in homescores
                ]
                homescores = [x.text for x in homescores if x is not None]
                if verydebug:
                    print("Home Scores {0}".format(len(homescores)))
                ## Get Away Score
                awayscores = drive.findAll("span", {"class": "away"})
                awayscores = [
                    x.find("span", {"class": "team-score"})
                    for x in awayscores
                ]
                awayscores = [x.text for x in awayscores if x is not None]
                if verydebug:
                    print("Away Scores {0}".format(len(awayscores)))
                ## Get Possession -- logo image name of the team with the
                ## ball (query string stripped, then base filename)
                possessions = drive.findAll("span", {"class": "home-logo"})
                possessions = [
                    x.find("img", {"class": "team-logo"})
                    for x in possessions
                ]
                possessions = [
                    x.attrs['src'] for x in possessions if x is not None
                ]
                possessions = [x.split('&')[0] for x in possessions]
                possessions = [getBaseFilename(x) for x in possessions]
                if verydebug:
                    print("Possessions {0}".format(len(possessions)))

                ## Check for valid headline (parsed correctly?)
                if len(headlines) == 0:
                    continue
                # Whitelists of drive-outcome headlines observed across
                # seasons/page versions (casing and wording vary by year)
                validFGs = [
                    "Missed FG", "Field Goal", "FIELD GOAL", "MISSED FG",
                    "Made FG", "Field Goal Good", "Field Goal Missed",
                    "Blocked FG"
                ]
                validTDs = [
                    "Touchdown", "TOUCHDOWN", "END OF HALF Touchdown",
                    "Downs Touchdown", "Missed FG Touchdown",
                    "End of Half Touchdown", "End of Game Touchdown",
                    "PUNT Touchdown", "FUMBLE Touchdown",
                    "INTERCEPTION Touchdown", "FIELD GOAL Touchdown",
                    "MISSED FG Touchdown", "Rushing Touchdown",
                    "Passing Touchdown", "Kickoff Return Touchdown",
                    "Interception Return Touch",
                    "Turnover on Downs Touchdown",
                    "Field Goal Missed Touchdown", "Field Goal Touchdown",
                    "Rushing Touchdown Touchdown",
                    "Field Goal Good Touchdown",
                    "Passing Touchdown Touchdown",
                    "Fumble Return Touchdown Touchdown", "Rushing TD",
                    "Passing TD", "Blocked Punt TD", "Punt Return TD",
                    "Fumble Ret. TD", "Interception TD", "Fumble TD",
                    "Rushing TD Touchdown", "Blocked Punt TD Touchdown",
                    "Blocked FG (TD)", "Punt Return TD Touchdown",
                    "Kick Return TD",
                    "Kickoff Return Touchdown Touchdown",
                    "Missed FG (TD) Touchdown", "Blocked FG (TD) Touchdown",
                    "Punt Return Touchdown Touchdown",
                    "Interception Return Touch Touchdown"
                ]
                validEnds = [
                    "End of Half", "End of Game", "END OF HALF",
                    "END OF GAME", "End of 4th Quarter"
                ]
                validTOs = [
                    "Fumble", "Interception", "FUMBLE", "INTERCEPTION",
                    "Kickoff", "KICKOFF", "Blocked Punt"
                ]
                validTOPnts = [
                    "Interception Touchdown", "Safety", "Punt Touchdown",
                    "Fumble Touchdown", "Punt Return Touchdown",
                    "Fumble Return Touchdown", "SAFETY"
                ]
                validDowns = [
                    "Punt", "Downs", "PUNT",
                    "Possession (For OT Drives)", "DOWNS",
                    "Possession (For OT Drives) Touchdown",
                    "Turnover on Downs", "Poss. on downs", "Penalty"
                ]
                validPlay = [
                    "Rush", "Pass", "Sack", "Timeout", "Incomplete",
                    "Pass Complete"
                ]
                valid2PT = ["2PT Pass failed", "Missed PAT Return"]
                # NOTE(review): validOdds is built but never added to
                # validHeadlines below -- intentional? confirm
                validOdds = ["on-side kick"]
                validHeadlines = validFGs + validTDs + validEnds + validTOs + validTOPnts + validDowns + validPlay + valid2PT
                isValidHeadline = sum(
                    [x in validHeadlines for x in headlines])
                if headlines[0] == '':
                    continue
                # Unrecognized headline on a non-final drive is fatal so
                # new headline strings get noticed and added above
                if isValidHeadline == 0 and idr < len(drives) - 1:
                    print(idr, '/', len(drives))
                    print(title)
                    print(ifile)
                    #print(bsdata)
                    raise ValueError(
                        "No valid headline in {0}".format(headlines))
                    # NOTE(review): the two lines below are unreachable
                    # after the raise above
                    print("No valid headline in {0}".format(headlines))
                    continue

                ## Analyze Play-by-Play
                try:
                    driveList = drive.find("ul", {"class": "drive-list"})
                    plays = driveList.findAll("li")
                except:
                    # driveList is None when the <ul> is absent
                    raise ValueError(
                        "Could not find drive list in drive {0}".format(
                            drive))
                driveData = []
                for ip, play in enumerate(plays):
                    ## Check for Starting Position
                    startPos = play.find("h3")
                    if startPos is None:
                        raise ValueError(
                            "Could not find Starting Position in Play! {0}"
                            .format(play))
                    startData = startPos.text.strip()
                    ## Check for Play Text
                    span = play.find("span", {"class": "post-play"})
                    if span is None:
                        raise ValueError(
                            "Could not find post play data! {0}".format(
                                play))
                    playData = span.text.strip()
                    driveData.append({
                        "Play": ip,
                        "Start": startData,
                        "Data": playData
                    })
                    #print(idr,'\t',ip,'\t',startData,'\t',playData)

                ## Save Drive Data
                # NOTE(review): "Drive" is len(gameData) (a 2-key dict, so
                # always 2), not the drive index -- looks like it should
                # be idr; confirm before changing
                gameData["Plays"].append({
                    "Drive": len(gameData),
                    "Headline": headlines,
                    "Detail": details,
                    "HomeScore": homescores,
                    "AwayScore": awayscores,
                    "Possession": possessions,
                    "Data": driveData
                })
                if verydebug:
                    print(idr, '\t', headlines)
                    print(idr, '\t', details)
                    print(idr, '\t', homescores)
                    print(idr, '\t', awayscores)
                    print(idr, '\t', possessions)
                    print("")

            if verydebug:
                # NOTE(review): len(gameData) is the dict key count (2),
                # not the number of drives
                print("Found {0} drives for gameID {1}".format(
                    len(gameData), gameID))
            yearData[gameID] = gameData

        # Persist this season's parsed games
        print("Parsed {0}/{1} games in {2}".format(len(yearData),
                                                   len(files), year))
        savename = setFile(self.getGamesResultsDir(),
                           "{0}-games.p".format(year))
        saveFile(idata=yearData, ifile=savename, debug=True)
    return noData