def createRawOscarData(self, debug=True):
    """Merge manually saved oscar corrections into the raw wiki results.

    Reads every per-year JSON results file, overlays any Winner/Nominees
    entries found in saved.yaml, and writes the merged data to raw.yaml.
    """
    print("Checking for poorly parsed oscar data.")
    indir = self.wikiData.getResultsDir()
    files = sorted(findExt(indir, ext=".json"))
    if debug:
        print("Found {0} oscar files".format(len(files)))

    # year -> {title -> {"Winner": ..., "Nominees": ...}}
    yearlyData = {getBaseFilename(ifile): getFile(ifile) for ifile in files}

    savename = setFile(self.getCorrectionsDir(), "saved.yaml")
    savedData = getFile(savename) if isFile(savename) else {}

    for year, titles in savedData.items():
        for title, corr in titles.items():
            savedWinner = corr.get("Winner")
            savedNominees = corr.get("Nominees")
            if savedWinner is not None:
                # fixed typo in log message ("Overwritting")
                print("Overwriting {0} {1} winner".format(year, title))
                yearlyData[year][title]["Winner"] = savedWinner
            if savedNominees is not None:
                print("Overwriting {0} {1} nominees".format(year, title))
                yearlyData[year][title]["Nominees"] = savedNominees

    savename = setFile(self.getCorrectionsDir(), "raw.yaml")
    saveFile(idata=yearlyData, ifile=savename)
def findSearchTerms(self, minCnts=25):
    """Count artist-name occurrences across downloaded result files.

    Returns names (title-cased, length > 1) seen at least `minCnts`
    times that are not already in the known list.
    """
    from collections import Counter  # removed unused `sleep` / `glob` imports

    artistsCntr = Counter()
    known = getFile(self.knownFile)
    files = getFlatList([findExt(dirval, ext='.p') for dirval in self.getModValDirs()])
    for ifile in files:
        if ifile.endswith("datPiffKnown.p"):
            continue
        tmp = getFile(ifile)
        results = [x["ArtistName"] for x in tmp]
        for artist in results:
            # one raw entry may map to several individual artist names;
            # use a distinct loop name instead of shadowing `artist`
            for name in self.mulArts.getArtistNames(artist).keys():
                key = name.title()
                if len(key) > 1 and key not in known:
                    artistsCntr[key] += 1

    searchTerms = [item[0] for item in artistsCntr.most_common() if item[1] >= minCnts]
    print("There are {0} new searches".format(len(searchTerms)))
    return searchTerms
def saveCorrections(self, debug=True):
    """Merge new corrections (corr.yaml) into the saved corrections file
    (saved.yaml), de-duplicating per movie, and write saved.yaml back.

    Raises ValueError when the saved data cannot be read or written.
    """
    corrsavename = setFile(self.getDataDir(), "corr.yaml")
    corrData = getFile(corrsavename)

    savename = setFile(self.getDataDir(), "saved.yaml")
    try:
        savedData = getFile(savename)
    except Exception:
        # original had an unreachable `savedData = {}` after this raise
        raise ValueError("Could not access saved data!")

    if corrData is None:
        print("There is no corrections data.")
    else:
        print("Found {0} old corrections".format(len(savedData)))
        print("Found {0} new corrections".format(len(corrData)))
        for movie, corrs in corrData.items():
            if savedData.get(movie) is None:
                if debug:
                    print("Adding {0}".format(movie))
                savedData[movie] = corrs
            else:
                # union of old and new correction entries for this movie
                newSaved = list(set(savedData[movie] + corrs))
                if len(newSaved) != len(savedData[movie]):
                    print("Adding new corrections to {0}".format(movie))
                savedData[movie] = newSaved

    try:
        saveFile(idata=savedData, ifile=savename, debug=debug)
        print("There are {0} total corrections".format(len(savedData)))
    except Exception:
        raise ValueError("There was an error saving the saved corrections yaml file!")
def parse(self, expr, force=False, debug=False, quiet=False):
    """Parse raw files matching `expr` and re-save each under its artist ID.

    Files whose parsed artist ID or savename is None are skipped.
    """
    ts = timestat("Parsing Raw Files")

    tsFiles = timestat("Finding Files To Parse")
    newFiles = self.getArtistRawFiles(datatype=self.datatype, expr=expr, force=force)
    tsFiles.stop()

    N = len(newFiles)
    tsParse = timestat("Parsing {0} New Raw Files".format(N))
    newData = 0
    # progress cadence scales with the amount of work
    modValue = 250 if N >= 500 else 50
    for i, ifile in enumerate(newFiles):
        if (i + 1) % modValue == 0 or (i + 1) == N:
            tsParse.update(n=i + 1, N=N)
        htmldata = getFile(ifile)
        retval = self.artist.getData(ifile)
        artistID = retval.ID.ID
        if artistID is None:
            continue
        savename = self.dutils.getArtistSavename(artistID)
        if savename is None:
            continue
        saveFile(idata=htmldata, ifile=savename, debug=False)
        newData += 1
    print("Created {0}/{1} New Artist Files".format(newData, N))
    tsParse.stop()
    ts.stop()  # was never stopped; now consistent with the HTML parser variant
def parse(self, expr, force=False, debug=False, quiet=False):
    """Parse raw HTML files matching `expr` and re-save each under its artist ID."""
    overall = timestat("Parsing Raw HTML Files")

    finder = timestat("Finding Files To Parse")
    rawFiles = self.getArtistRawHTMLFiles(expr, force)
    finder.stop()

    total = len(rawFiles)
    if debug:
        print("Parsing {0} Raw HTML Files From Expr[{1}]".format(total, expr))
    # progress cadence scales with the amount of work
    step = 250 if total >= 500 else 50
    parser = timestat("Parsing {0} Raw HTML Files".format(total))
    for idx, rawFile in enumerate(rawFiles):
        nDone = idx + 1
        if debug or nDone == total or nDone % step == 0:
            parser.update(n=nDone, N=total)
        if debug:
            print("{0}/{1}\tParsing {2}".format(idx, total, rawFile))
        pageData = getFile(rawFile)
        parsed = self.artist.getData(rawFile)
        artistID = parsed.ID.ID
        if debug:
            print(" ---> ID={0}".format(artistID))
        outName = self.dutils.getArtistSavename(artistID)
        saveFile(idata=pageData, ifile=outName, debug=False)
    parser.stop()
    overall.stop()
def parseBAFTACategoryData(self, ifile, category, debug=False):
    """Parse BAFTA wikitables for one category from a saved HTML file.

    Director tables use the director parser, everything else the film
    parser.  Returns {year: {category: [unique entries]}}.
    """
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)

    data = {}  # removed unused `done` flag
    tables = bsdata.findAll("table", {"class": "wikitable"})
    if debug:
        print("  Found {0} tables".format(len(tables)))
    for table in tables:
        if category == "Best_Direction":
            yeardata = self.parseBAFTADirectorData(table, category, debug=False)
        else:
            yeardata = self.parseBAFTAFilmData(table, category, debug=False)
        data = {**data, **yeardata}

    # de-duplicate entries; loop variable renamed so it no longer
    # shadows the `category` parameter
    for year, yearData in data.items():
        for cat in yearData.keys():
            data[year][cat] = list(set(data[year][cat]))
    return data
def parseAndDownloadTeamYearlyStandings(self):
    """Scan saved season pages, extract team IDs and names from clubhouse
    links, and download per-team data for each season."""
    for seasonFile in findExt(self.getSeasonDir(), ext=".p", debug=False):
        year = getBaseFilename(seasonFile)
        bsdata = getHTML(getFile(seasonFile))

        idVals = {}
        for link in bsdata.findAll("a"):
            attrs = link.attrs
            if attrs.get("data-clubhouse-uid") is None:
                continue
            href = attrs['href']
            name = getBasename(href)
            idval = getBasename(getDirname(href))
            # a repeated ID must always map to the same team name
            if idVals.get(idval) is not None and idVals[idval] != name:
                raise ValueError("Error in ID for this year!")
            idVals[idval] = name

        for idVal, name in idVals.items():
            self.downloadTeamDataByYear(idVal, name, season=str(year), debug=True)
def rmIDFromDB(self, artistID, modValue=None):
    """Remove one or more artist IDs from the mod-value DB file, delete
    their associated files, and save the DB only if something was removed.

    artistID may be a single ID string or a list of IDs.  modValue is
    derived from the ID when not supplied.
    """
    print("Trying to remove data from ArtistID {0}".format(artistID))
    if modValue is None:
        modValue = self.dutils.getDiscIDHashMod(discID=artistID, modval=self.disc.getMaxModVal())
    artistDBDir = self.disc.getArtistsDBDir()
    dbname = setFile(artistDBDir, "{0}-DB.p".format(modValue))
    print("Loading {0}".format(dbname))
    dbdata = getFile(dbname)

    saveVal = False
    if isinstance(artistID, str):
        artistID = [artistID]
    elif not isinstance(artistID, list):
        raise ValueError("Not sure what to do with {0}".format(artistID))

    for ID in artistID:
        try:
            del dbdata[ID]
            print("Deleted {0}".format(ID))
            saveVal = True
        except KeyError:  # narrowed from a bare except: only a missing key is expected
            print("Not there...")
        self.rmIDFiles(ID)

    if saveVal:
        print("Saving {0}".format(dbname))
        saveFile(idata=dbdata, ifile=dbname)
    else:
        print("No reason to save {0}".format(dbname))
def parseFilmsiteYearlyData(self, ifile, debug=False):
    """Extract movie titles from a saved filmsite yearly HTML page."""
    bsdata = getHTML(getFile(ifile))
    movies = []
    # skip the first table and the first row of each remaining table
    for table in bsdata.findAll("table")[1:]:
        for tr in table.findAll("tr")[1:]:
            tds = tr.findAll("td")
            if len(tds) != 2:
                continue
            mdata = tds[1].find("b")
            if mdata is None:
                continue
            # drop newlines / carriage returns, then collapse repeated spaces
            movie = "".join(c for c in mdata.text if ord(c) not in (10, 13))
            while movie.find("  ") != -1:
                movie = movie.replace("  ", " ")
            # strip a trailing "(...)" suffix such as the year
            pos = movie.rfind("(")
            if pos != -1:
                movie = movie[:pos].strip()
            movies.append(movie)
    return movies
def parseRazziesCategoryData(self, ifile, category, debug=False):
    """Parse Razzies wikitables for one category from a saved HTML file.

    Tables with a <caption> are treated as film tables; the rest as
    acting tables.  Returns {year: {category: [unique entries]}}.
    """
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)

    data = {}  # removed unused `done` flag
    tables = bsdata.findAll("table", {"class": "wikitable"})
    if debug:
        print("  Found {0} tables".format(len(tables)))
    for table in tables:
        if table.find("caption") is None:
            yeardata = self.parseRazziesActingData(table, category, debug=False)
        else:
            yeardata = self.parseRazziesFilmData(table, category, debug=False)
        data = {**data, **yeardata}

    # de-duplicate entries; loop variable renamed so it no longer
    # shadows the `category` parameter
    for year, yearData in data.items():
        for cat in yearData.keys():
            data[year][cat] = list(set(data[year][cat]))
    return data
def processFlopsData(self, debug=False):
    """Parse downloaded flops HTML wikitables into {year: [[movie, 10]]}
    and save the result as {name}.json (each movie gets a fixed weight 10)."""
    outdir = self.getDataDir()
    files = findExt(outdir, ext=".html")
    from collections import OrderedDict
    movies = OrderedDict()
    yearlyData = {}
    for ifile in files:
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        tables = bsdata.findAll("table", {"class": "wikitable"})
        for table in tables:
            trs = table.findAll("tr")
            try:
                # header row: column names (collected, then only printed)
                ths = trs[0].findAll("th")
                ths = [x.text for x in ths]
                ths = [x.replace("\n", "") for x in ths]
            except:
                raise ValueError("Could not get headers")
            print(ths)
            # data rows start at index 2 — rows 0 and 1 are header rows
            for itr, tr in enumerate(trs[2:]):
                # NOTE: `ths` is reused here for the row's <th> cells
                ths = tr.findAll("th")
                try:
                    movie = ths[0].text
                    movie = movie.replace("\n", "").strip()
                    movie = movie.replace("[nb 2]", "")  # strip wiki footnote marker
                except:
                    raise ValueError(
                        "Could not find movie in {0}".format(ths))
                tds = tr.findAll("td")
                try:
                    # first <td> of each data row holds the year
                    year = tds[0].text
                    year = int(year)
                except:
                    raise ValueError(
                        "Could not find year in {0}".format(tds))
                print(year, '\t', movie)
                if yearlyData.get(year) is None:
                    yearlyData[year] = []
                yearlyData[year].append(movie)
    for year in sorted(yearlyData.keys()):
        movies[year] = []
        for movie in yearlyData[year]:
            # every flop entry is paired with a fixed score of 10
            movies[year].append([movie, 10])
    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of flops Data to {1}".format(
        len(movies), savename))
    saveFile(savename, movies)
def parseArtistFiles(self, force=False, debug=False):
    """Parse downloaded search-result files into per-modval artist DBs.

    Builds artist records grouped by mod value, runs each through the
    artist parser, saves every {modVal}-DB.p file, and refreshes the
    mod-value metadata.
    """
    artistDBData = {}
    # use the configured known-files directory; the previous hard-coded
    # glob over /Volumes/Biggy/... was a leftover debugging override
    files = findExt(self.knownDir, ext='.p')
    print("Found {0} downloaded search terms".format(len(files)))
    for i, ifile in enumerate(files):
        if ifile.endswith("datPiffKnown.p"):
            continue
        fileresults = getFile(ifile)
        if debug:
            print(i, '/', len(files), '\t', ifile)
        for j, fileresult in enumerate(fileresults):
            if debug:
                print("  ", j, '/', len(fileresults))
            mixArtists = fileresult["ArtistName"]
            albumName = fileresult["AlbumName"]
            albumURL = fileresult["AlbumURL"]
            # a mixtape credit can expand to several individual artists
            mixArtistNames = self.mulArts.getArtistNames(mixArtists)
            mixArtistNames = [x.title() for x in mixArtistNames.keys()]
            for artistName in mixArtistNames:
                artistID = str(self.dutils.getArtistID(artistName))
                albumID = str(self.dutils.getArtistID(albumName))
                modval = self.dutils.getArtistModVal(artistID)
                if artistDBData.get(modval) is None:
                    artistDBData[modval] = {}
                if artistDBData[modval].get(artistName) is None:
                    artistDBData[modval][artistName] = {"Name": artistName, "ID": artistID,
                                                        "URL": None, "Profile": None, "Media": []}
                albumData = {"Artists": mixArtistNames, "Name": albumName,
                             "URL": albumURL, "Code": albumID}
                artistDBData[modval][artistName]["Media"].append(albumData)

    artistDBDir = self.disc.getArtistsDBDir()
    totalSaves = 0
    for modVal, modvaldata in artistDBData.items():
        dbData = {}
        for artistName, artistData in modvaldata.items():
            self.artist.setData(artistData)
            artistVal = self.artist.parse()
            dbData[artistVal.ID.ID] = artistVal
        savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
        print("Saving {0} artist IDs to {1}".format(len(dbData), savename))
        totalSaves += len(dbData)
        saveFile(idata=dbData, ifile=savename)
        self.createArtistModValMetadata(modVal=modVal, db=dbData, debug=debug)
        self.createArtistAlbumModValMetadata(modVal=modVal, db=dbData, debug=debug)
    print("Saved {0} new artist IDs".format(totalSaves))
def __init__(self, path, chart, debug=False):
    """Load chart data from `path`: one specific chart when given, or the
    merged union of every known chart when `chart` is None."""
    self.debug = debug
    self.chart = chart
    self.path = path

    def fullChartFile(ch):
        # path of the full-chart artist/album pickle for one chart
        return setFile(path, "current{0}FullChartArtistAlbumData.p".format(ch.lower()))

    def artistAlbumFile(ch):
        # path of the artist/album pickle for one chart
        return setFile(path, "current{0}ArtistAlbumData.p".format(ch.lower()))

    if chart is None:
        fullChartData = {}
        artistAlbumData = {}
        for ch in ["MusicVF", "Billboard", "BillboardYE", "RateYourMusic",
                   "RateYourMusicSong", "RateYourMusicList", "RateYourMusicList2"]:
            print(ch)
            fullChartData.update(getFile(fullChartFile(ch)))
            print("There are {0} artists in the full chart data".format(
                len(fullChartData)))
            artistAlbumData.update(getFile(artistAlbumFile(ch)))
            print("There are {0} artists in the artist album data".format(
                len(artistAlbumData)))
        self.fullChartData = fullChartData
        self.artistAlbumData = artistAlbumData
    else:
        self.fullChartData = getFile(fullChartFile(chart))
        print("There are {0} artists in the full chart data".format(
            len(self.fullChartData)))
        self.artistAlbumData = getFile(artistAlbumFile(chart))
        print("There are {0} artists in the artist album data".format(
            len(self.artistAlbumData)))

    self.artistData = {}
    self.artistKeyToNameMap = {}
def getMyMovies(self, debug=False):
    """Return the contents of mymovies.json, raising if the file is absent."""
    fname = setFile(self.getDataDir(), "mymovies.json")
    if isFile(fname):
        myMovies = getFile(fname)
        if debug:
            print("Found {0} my movies".format(len(myMovies)))
        return myMovies
    raise ValueError("Cannot access {0}".format(fname))
def parseArtistMetadataFiles(self, debug=False):
    """Rebuild artist and artist-album metadata for every mod value's DB file."""
    dbDir = self.disc.getArtistsDBDir()
    for modVal in range(self.disc.getMaxModVal()):
        db = getFile(setFile(dbDir, "{0}-DB.p".format(modVal)))
        self.createArtistModValMetadata(modVal=modVal, db=db, debug=debug)
        self.createArtistAlbumModValMetadata(modVal=modVal, db=db, debug=debug)
def getCombinedMovies(self, debug=False):
    """Load and return the combined movies.json results file.

    Raises ValueError if the file does not exist.  (The original raised
    a misspelled `ValueErrro`, which would itself crash with NameError.)
    """
    savename = setFile(self.combine.getResultsDir(), "movies.json")
    if not isFile(savename):
        raise ValueError("Cannot access {0}".format(savename))
    combinedMovies = getFile(savename)
    if debug:
        print("Found {0} combined movies".format(len(combinedMovies)))
    return combinedMovies
def assertDBModValExtraData(self, modVal, minPages=1, maxPages=None, allowMulti=False, test=True, clean=True):
    """Re-download (or, in test mode, just list) extra pages for artists in
    one mod-value DB whose data spans multiple pages (pages.more is True).

    modVal     -- which {modVal}-DB.p file to scan
    minPages   -- skip artists with fewer pages than this
    maxPages   -- cap on pages processed per artist (None = all)
    allowMulti -- unused in this body — presumably a multi-page switch; TODO confirm
    test       -- when True, only print what would be downloaded
    clean      -- when True, remove existing page files before downloading
    """
    print("assertDBModValExtraData(",modVal,")")
    artistDBDir = self.disc.getArtistsDBDir()
    dbname = setFile(artistDBDir, "{0}-DB.p".format(modVal))
    dbdata = getFile(dbname)
    nerrs = 0  # NOTE(review): never incremented — dead counter
    #ignores = self.artistIgnoreList()
    for artistID,artistData in dbdata.items():
        first = True  # header line for this artist printed at most once
        pages = artistData.pages
        if pages.more is True:
            npages = pages.pages
            if npages < minPages:
                continue
            if maxPages is not None:
                npages = min([npages, maxPages])
            artistRef = artistData.url.url
            #if artistData.artist.name in ignores:
            #    print("\tNot downloading artist in ignore list: {0}".format(artistData.artist.name))
            #    continue
            #savename = self.dutils.getArtistSavename(artistID)
            #removeFile(savename)
            #print("\t---> {0} / {1} {2}".format(1, pages.pages, savename))
            #print(artistID,'\t',npages,'\t')
            #continue
            for p in range(1, npages+1):
                # page 1 uses the base URL/savename; later pages are suffixed
                if p == 1:
                    url = self.getArtistURL(artistRef)
                    savename = self.dutils.getArtistSavename(artistID)
                else:
                    url = self.getArtistURL(artistRef, p)
                    savename = self.dutils.getArtistSavename(artistID, p)
                print("\t---> {0} / {1} {2}".format(p, pages.pages, url))
                if clean is True:
                    if isFile(savename):
                        print("Removing {0}".format(savename))
                        removeFile(savename)
                if test is True:
                    print("\t\tWill download: {0}".format(url))
                    print("\t\tJust testing... Will not download anything.")
                    continue
                # only download pages that are not already on disk
                if not isFile(savename):
                    if first:
                        print("{0: <20}{1: <10}{2}".format(artistID,pages.tot,artistData.artist.name))
                        first = False
                    print("{0: <20}{1: <10}{2}".format(artistID, "{0}/{1}".format(p,pages.pages), url))
                    self.dutils.downloadArtistURL(url=url, savename=savename, force=True)
                    sleep(3)  # throttle between downloads
def parseBoxOfficeMojo(self, ifile, debug=False):
    """Parse a Box Office Mojo table from a saved HTML file.

    The first table whose first row has >= 10 entries is taken as the
    data table; its first row supplies the column keys.  Returns a list
    of row-value lists.  Raises ValueError on malformed rows.
    """
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)

    # locate the first table with a wide (>=10 entries) first row
    tbl = None
    for table in bsdata.findAll("table"):
        if tbl:
            break
        for tr in table.findAll("tr"):
            if len(tr) >= 10:
                tbl = table
                break
            else:
                break  # only inspect the first row of each table

    keys = []
    data = []
    for i, tr in enumerate(tbl):
        vals = []
        if i == 0:
            # header row: column names live inside anchor tags
            for td in tr.findAll("td"):
                for ref in td.findAll("a"):
                    keys.append(ref.string)
        else:
            if len(tr) <= 1:
                continue
            for j, td in enumerate(tr.findAll("td")):
                if td.string is None:
                    continue
                try:
                    # the summary row marks the end of the data
                    if re.search(r"TOTAL \((\d+) MOVIES\)", td.string):
                        break
                except TypeError:
                    print(j, td.string)
                    # the original `raise ()` raised a TypeError about the
                    # raise itself; re-raise the real error instead
                    raise
                vals.append(td.string)
            if len(vals) == 0:
                break
            if len(vals) != len(keys):
                print("Mismatch with keys/data")
                print(len(keys), '\t', keys)
                print(len(vals), '\t', vals)
                # original `raise ("YO")` raised a string (TypeError in py3)
                raise ValueError("Mismatch between table keys and row values")
            data.append(vals)

    if debug:
        print("Found", len(data), "movies from", ifile)
    return data
def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
    """Parse saved Ultimate Movie Rankings pages into {year: [(film, rank)]}
    sorted by rank descending, and save the result as {name}.json.

    procYear -- when set, only parse the files matching that year.
    """
    outdir = self.getDataDir()
    if procYear is None:  # was `== None`
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

    from collections import OrderedDict
    movieData = OrderedDict()
    for ifile in sorted(files):
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        year = getBaseFilename(ifile)
        movies = {}
        for table in bsdata.findAll("table"):
            for tr in table.findAll("tr"):
                tds = tr.findAll("td")
                # data rows have exactly 11 cells
                if len(tds) != 11:
                    continue
                val = removeTag(tds[1], 'span')
                film = val.text.replace(" ({0})".format(year), "")
                # the rank usually sits in the last cell, occasionally the
                # second-to-last; narrowed from bare excepts
                try:
                    rank = float(tds[-1].text)
                except ValueError:
                    try:
                        rank = float(tds[-2].text)
                    except ValueError:
                        raise ValueError(tds[-1], tds[-2], tr)
                movies[film] = rank
        movieData[year] = movies

    yearlyData = {}
    for year in sorted(movieData.keys()):
        yearlyData[year] = sorted(movieData[year].items(),
                                  key=operator.itemgetter(1), reverse=True)
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
        for item in yearlyData[year][:5]:
            print(item)
        print('\n')

    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
        len(yearlyData), savename))
    saveFile(savename, yearlyData)
def parseRottenTomatoesFile(self, ifile, debug=False):
    """Parse a saved Rotten Tomatoes table page into {year: {movie: rating}}
    with the rating as an int percentage.  The year is taken from the
    "(YYYY)" suffix of the movie title."""
    movies = {}
    if debug:
        print("Parsing {0}".format(ifile))
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)
    table = bsdata.find("table", {"class": "table"})
    if table:
        keys = []
        for tr in table.findAll("tr"):
            if len(keys) == 0:
                # first row: header cells give the column keys
                for th in tr.findAll("th"):
                    key = th.string
                    if key == None:
                        # some headers nest their text inside <span> tags
                        key = " ".join(
                            [x.string for x in th.findAll("span")])
                    keys.append(key)
                    #print key
            else:
                # data row: by column index — 1 is the rating, 2 the title
                line = []
                for i, td in enumerate(tr.findAll("td")):
                    #print i,'\t',td
                    if i == 0 or i == 3:
                        val = td.string
                    if i == 1:
                        # rating may be wrapped in one of several spans
                        for span in td.findAll("span"):
                            if span.string:
                                val = span.string
                                break
                    if i == 2:
                        ref = td.find("a")
                        #link = ref.attrs["href"]
                        val = ref.string
                    val = val.strip()
                    line.append(val)
                    #print i,'\t',val.strip()
                movie = line[2]
                rating = line[1]
                rating = rating.replace("%", "")
                rating = int(rating)
                # pull the "(year)" suffix out of the movie title
                retval = re.search("\((\d+)\)", movie)
                if retval:
                    year = retval.group()
                    movie = movie.replace(year, "").strip()
                    year = retval.groups()[0]
                #retval = search(r'(%d+)', movie)
                # NOTE(review): if no "(year)" matches, `year` keeps its
                # value from the previous row (or is undefined on the first)
                if movies.get(year) == None:
                    movies[year] = {}
                movies[year][movie] = rating
                #print year,'\t',rating,'\t',movie
    return movies
def searchBoxOfficeMojo(self, movie, debug=False):
    """Print near-matches for `movie` within the saved yearly results."""
    resultsFile = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    yearlyResults = getFile(resultsFile)
    print("Nearest matches for {0}".format(movie))
    for year, yearlyMovies in yearlyResults.items():
        names = [entry[0] for entry in yearlyMovies]
        matches = findNearest(movie, names, num=1, cutoff=0.9)
        if matches:
            hits = [(name, value) for name, value in yearlyMovies if name in matches]
            print("{0: <6}{1}".format(year, hits))
def parseDownloadedFiles(self):
    """Re-save each downloaded discography page under its parsed artist ID."""
    dataDir = setDir(self.disc.getArtistsDir(), "data")
    rawFiles = findPatternExt(dataDir, pattern="Discography and Albums", ext=".htm")
    for rawFile in rawFiles:
        pageData = getFile(rawFile)
        artistID = self.getData(rawFile).ID.ID
        outName = self.getArtistSavename(artistID)
        saveFile(idata=pageData, ifile=outName, debug=True)
def parseDownloadedFiles(self, previousDays=None, force=False):
    """Collect raw HTML files for parsing and return.

    Fixes: the original hard-coded previousDays=None / force=False in the
    call instead of forwarding its own parameters.  Everything after the
    early `return` was unreachable dead code and has been removed.
    """
    files = self.getArtistRawHTMLFiles(previousDays=previousDays, force=force)
    return
def moveMyMatchedMusicAlbums(self, show=False):
    """Move locally matched album directories into per-media-type 'Match'
    subdirectories, naming each after the matched DB album.

    show -- when True, only print the planned moves without moving anything.
    """
    rename = True  # destination uses the DB album name, not the local name
    albumsToMove = getFile(ifile=self.moveFilename)
    print("Found {0} music <-> discogs albums maps".format(
        len(albumsToMove)))
    for db, dbValues in albumsToMove.items():
        if dbValues is None:
            continue
        for artistName, artistAlbums in dbValues.items():
            print("==>", artistName)
            for myAlbumName, albumVals in artistAlbums.items():
                dirval = albumVals["Dir"]
                albumVal = albumVals["Album"]
                ratio = albumVals["Ratio"]  # match quality score; unused here
                dbAlbumName = albumVal["Name"]
                dbAlbumCode = albumVal["Code"]
                mediaType = albumVal["MediaType"]
                matchedDir = setDir(dirval, "Match")
                mkDir(matchedDir)
                srcName = myAlbumName
                srcDir = setDir(dirval, srcName)
                if not isDir(srcDir):
                    print("{0} does not exist".format(srcDir))
                    continue
                # destination is grouped by (converted) media type
                mediaDir = setDir(matchedDir, self.discConv(mediaType))
                mkDir(mediaDir)
                if rename is True:
                    dstName = self.getMatchedDirName(
                        self.discConv(dbAlbumName), dbAlbumCode, db)
                else:
                    dstName = self.getMatchedDirName(
                        myAlbumName, dbAlbumCode, db)
                if show is True:
                    # dry run: report the planned move only
                    print('\t{0}'.format(mediaDir))
                    print("\t\t[{0}]".format(srcName))
                    print("\t\t[{0}]".format(dstName))
                    continue
                dstDir = setDir(mediaDir, dstName)
                if isDir(dstDir):
                    print("{0} already exists".format(dstDir))
                    continue
                print("\tMoving {0} ---> {1}".format(srcDir, dstDir))
                moveDir(srcDir, dstDir, debug=True)
def parseFilms101YearlyData(self, ifile, debug=False):
    """Parse a films101 yearly page by pairing each header table (lsthdg)
    with its data table (lstdta); return the TITLE column as a list.

    Raises ValueError on header/data mismatches.
    """
    if debug:
        print(ifile)
    htmldata = getFile(ifile)
    bsdata = getHTML(htmldata)

    movies = []
    headertables = bsdata.findAll("table", {"class": "lsthdg"})
    datatables = bsdata.findAll("table", {"class": "lstdta"})
    if len(headertables) < len(datatables):
        print(headertables)
        raise ValueError("Found {0} headers and {1} data tables".format(
            len(headertables), len(datatables)))
    if debug:
        print("Found {0} tables".format(len(datatables)))

    expect = 0
    for headertable, datatable in zip(headertables, datatables):
        headers = [td.text.strip() for td in headertable.findAll("td") if td is not None]
        trs = datatable.findAll("tr")
        expect = len(trs)
        for tr in trs:
            tds = [td.text for td in tr.findAll("td") if td is not None]
            if len(tds) != len(headers):
                print(headers)
                print(tds)
                # was a deliberate `1/0` crash; raise a real error instead
                raise ValueError("Header/data column count mismatch")
            mdata = dict(zip(headers, tds))
            try:
                movie = mdata['TITLE']
            except KeyError:
                raise ValueError("Could not get movie name from TITLE key! {0}".format(mdata))
            movies.append(movie)

    if debug:
        print("Found {0}/{1} movies".format(len(movies), expect))
    return movies
def getDBData(self, dbname, prefix, returnName=False, debug=False):
    """Return the contents of the {prefix}{dbname}.p DB file, or just its
    path when returnName is True.

    Raises ValueError if the file does not exist.
    """
    dbFile = setFile(self.getDiscogDBDir(), "{0}{1}.p".format(prefix, dbname))
    if self.debug is True:
        print("Data stored in {0}".format(dbFile))
    if returnName is True:
        return dbFile
    if not isFile(dbFile):
        raise ValueError("Could not find {0}".format(dbFile))
    if self.debug:
        print("Returning data from {0}".format(dbFile))
    return getFile(dbFile, debug=debug)
def getData(self, inputdata):
    """Resolve `inputdata` (a filename, a raw HTML string, or a BS4
    object) into parsed HTML, store it on self, and return self.parse().

    Raises ValueError when the input cannot be interpreted.  Bare
    `except:` clauses narrowed to `except Exception`.
    """
    if isinstance(inputdata, str):
        if isFile(inputdata):
            try:
                bsdata = getHTML(getFile(inputdata))
            except Exception:
                # fall back to the alternate file-format version
                try:
                    bsdata = getHTML(getFile(inputdata, version=2))
                except Exception:
                    raise ValueError("Cannot read artist file: {0}".format(inputdata))
        else:
            try:
                bsdata = getHTML(inputdata)
            except Exception:
                raise ValueError("Not sure about string input: {0} . It is not a file".format(inputdata))
    elif isBS4(inputdata):
        bsdata = inputdata
    else:
        raise ValueError("Not sure about input type: {0}".format(type(inputdata)))

    self.bsdata = bsdata
    return self.parse()
def getData(self):
    """Load each movie source's results file, record its data and the
    years it covers, and compute the overall year range.

    Raises ValueError if any source's results file is missing.
    """
    allYears = []
    for key in self.sources:
        src = self.movieSource[key]
        filename = setFile(src.getResultsDir(), "{0}.json".format(src.name))
        if not isFile(filename):
            raise ValueError("There is not results file: {0}".format(filename))
        self.movieSourceData[key] = getFile(filename)
        self.movieSourceYears[key] = list(self.movieSourceData[key].keys())
        print("Found {0} Years of {1} Movies".format(len(self.movieSourceYears[key]), key))
        allYears += self.movieSourceYears[key]
    self.years = sorted(set(allYears))
    print("Found Data Between {0} and {1}".format(min(self.years), max(self.years)))
def searchForArtist(self, artist):
    """Download search results for `artist`, record the term as already
    searched, and hand the page to the search parser."""
    print("\n\n===================== Searching For {0} =====================".format(artist))
    url = self.getSearchArtistURL(artist)
    if url is None:
        raise ValueError("URL is None!")

    ## Download data
    data, response = self.downloadURL(url)
    if response != 200:
        print("Error downloading {0}".format(url))
        return False

    # remember this term so it is not searched again later
    previouslySearched = getFile(self.knownFile)
    print("  Found {0} previously searched for terms.".format(len(previouslySearched)))
    previouslySearched.append(artist)
    saveFile(idata=previouslySearched, ifile=self.knownFile)

    self.parseSearchArtist(artist, data)
def downloadTeamStatisticsData(self, debug=False):
    """Download per-team statistics for each saved season file.

    Currently restricted to the 2014 season by a hard-coded filter.
    """
    seasonFiles = findExt(self.getSeasonResultsDir(), ext=".p", debug=False)
    sleep(3)
    for seasonFile in seasonFiles:
        seasonData = getFile(seasonFile)
        year = seasonData.getYear()
        gamesDir = self.getYearlyGamesDir(year)
        # NOTE(review): hard-coded season filter — confirm this is intended
        if year != 2014:
            continue
        for teamID, teamData in seasonData.teams.items():
            self.downloadTeamStatisticsDataByYear(teamID, teamData.teamName, year, debug)