def processWikiFilmYearlyData(self, procYear=None, debug=False):
    """Parse the downloaded WikiFilm files and save one combined ranking.

    Each selected file is passed to ``parseWikiFilmYearlyData``; the items of
    the returned mapping are sorted by ascending value and stored under the
    file's base name, which is used as the year key.

    Args:
        procYear: When given, only ``.p`` files in the data directory whose
            name matches ``str(procYear)`` are processed. ``None`` selects
            every ``.p`` file.
        debug: When true, print each file name before it is processed.

    Returns:
        None. The per-year lists are handed to ``saveFile`` under
        ``<self.name>.json`` in the results directory.
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

    yearlyData = {}
    # Sorted so years are processed (and printed) in a stable order.
    for ifile in sorted(files):
        if debug:
            print("Processing {0}".format(ifile))
        year = getBaseFilename(ifile)
        parsed = self.parseWikiFilmYearlyData(ifile, debug=False)
        # Ascending by value: the smallest values come first.
        ranked = sorted(parsed.items(), key=operator.itemgetter(1))
        yearlyData[year] = ranked

        print("---->", year, " (Top 5/{0} Movies) <----".format(len(ranked)))
        for item in ranked[:5]:
            print(item)
        print('\n')

    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} WikiFilm data to {1}".format(len(yearlyData), savename))
    saveFile(savename, yearlyData)
def createArtistMetadataMap(self):
    """Build and save the artist-ID lookup tables for genre, style and collaborations.

    Reads every ``-ArtistMetadata`` ``.p`` file in the albums metadata DB
    directory, copies each artist's ``'Genre'``, ``'Artists'`` and ``'Style'``
    entries into three maps keyed by artist ID, then saves each map as a
    ``Series`` named ``Artist<suffix>.p`` in the Discog DB directory.
    """
    timer = timestat("Creating Artist DBs")

    genreByID = {}
    styleByID = {}
    collabsByID = {}

    metadataDir = self.disc.getAlbumsMetadataDBDir()
    for dbFile in findPatternExt(metadataDir, pattern="-ArtistMetadata", ext='.p'):
        # File name and running total share one output line.
        print(dbFile, '\t', end="")
        for artistID, meta in getFile(dbFile).items():
            genreByID[artistID] = meta['Genre']
            collabsByID[artistID] = meta['Artists']
            styleByID[artistID] = meta['Style']
        print(len(genreByID))

    print("\n\n==============================================\n")

    outputs = (
        ("IDToGenre", genreByID),
        ("IDToStyle", styleByID),
        ("IDToCollaborations", collabsByID),
    )
    for suffix, mapping in outputs:
        target = setFile(self.disc.getDiscogDBDir(), "Artist{0}.p".format(suffix))
        print("Saving {0} entries to {1}\n".format(len(mapping), target))
        saveFile(ifile=target, idata=Series(mapping), debug=True)

    timer.stop()
def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
    """Extract film rankings from the downloaded yearly pages and save them.

    For every selected file the stored HTML is parsed and each table row with
    exactly 11 ``<td>`` cells is read: the film title comes from the second
    cell (with ``span`` tags removed and a trailing `` (<year>)`` stripped),
    and the rank from the last cell, falling back to the second-to-last cell
    when the last one is not numeric.

    Args:
        procYear: When given, only ``.p`` files in the data directory whose
            name matches ``str(procYear)`` are processed. ``None`` selects
            every ``.p`` file.
        debug: Accepted for interface compatibility; not used here.

    Returns:
        None. A mapping of year to ``(film, rank)`` pairs sorted by
        descending rank is handed to ``saveFile`` under ``<self.name>.json``
        in the results directory.

    Raises:
        ValueError: If neither of the last two cells of a matching row can
            be converted to ``float``.
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

    movieData = {}
    for ifile in sorted(files):
        bsdata = getHTML(getFile(ifile))
        year = getBaseFilename(ifile)
        yearSuffix = " ({0})".format(year)

        movies = {}
        for table in bsdata.findAll("table"):
            for tr in table.findAll("tr"):
                tds = tr.findAll("td")
                # Only rows with exactly 11 cells carry a film entry.
                if len(tds) != 11:
                    continue

                film = removeTag(tds[1], 'span').text.replace(yearSuffix, "")

                # Try the last cell first, then the one before it. Only a
                # failed numeric conversion triggers the fallback, so
                # unrelated errors (and Ctrl-C) are not swallowed.
                for cell in (tds[-1], tds[-2]):
                    try:
                        rank = float(cell.text)
                    except ValueError:
                        continue
                    break
                else:
                    raise ValueError(tds[-1], tds[-2], tr)

                movies[film] = rank
        movieData[year] = movies

    yearlyData = {}
    for year in sorted(movieData):
        ranked = sorted(movieData[year].items(), key=operator.itemgetter(1), reverse=True)
        yearlyData[year] = ranked
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(ranked)))
        for item in ranked[:5]:
            print(item)
        print('\n')

    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
        len(yearlyData), savename))
    saveFile(savename, yearlyData)
def parseDownloadedFiles(self, previousDays=None, force=False):
    """Look up the raw artist HTML files; the re-save pass below is disabled.

    Args:
        previousDays: Forwarded to ``getArtistRawHTMLFiles``.
        force: Forwarded to ``getArtistRawHTMLFiles``.

    Returns:
        None. The method returns right after the lookup call.
    """
    artistDir = self.disc.getArtistsDir()

    # Forward the caller's options instead of hard-coded None/False, so the
    # parameters of this method actually take effect. The result is not used
    # because of the early return below.
    self.getArtistRawHTMLFiles(previousDays=previousDays, force=force)

    # NOTE(review): this unconditional return makes the pass below
    # unreachable. It looks like a deliberate switch-off, so it is kept;
    # confirm whether the code below should be re-enabled or deleted.
    return

    dataDir = setDir(artistDir, "data")
    files = findPatternExt(dataDir, pattern="Rate Your Music", ext=".html")
    for ifile in files:
        htmldata = getFile(ifile)
        retval = self.artist.getData(ifile)
        artistID = retval.ID.ID
        savename = self.dutils.getArtistSavename(artistID)
        saveFile(idata=htmldata, ifile=savename, debug=False)
def parseDownloadedFiles(self):
    """Re-save each downloaded "Discography and Albums" page under its artist ID.

    For every matching ``.htm`` file in the ``data`` sub-directory of the
    artists directory, the file content is loaded, the artist ID is read
    from ``self.getData(<file>).ID.ID``, and the content is saved to the
    path returned by ``getArtistSavename``.
    """
    dataDir = setDir(self.disc.getArtistsDir(), "data")
    pages = findPatternExt(dataDir, pattern="Discography and Albums", ext=".htm")

    for page in pages:
        content = getFile(page)
        artistID = self.getData(page).ID.ID
        target = self.getArtistSavename(artistID)
        saveFile(idata=content, ifile=target, debug=True)
def parseBoxOfficeMojoResults(self, startYear=1982, endYear=2017, debug=False):
    """Parse the downloaded Box Office Mojo files and save one result file per year.

    Args:
        startYear: First year to process (converted with ``int``).
        endYear: Last year to process, inclusive. ``None`` means only
            ``startYear`` is processed.
        debug: Forwarded to ``parseBoxOfficeMojo``.

    Returns:
        None. For each year, the list of per-file parse results is handed to
        ``saveFile`` under ``<year>.json`` in the results directory.
    """
    outdir = self.getDataDir()
    resultsdir = self.getResultsDir()
    if endYear is None:
        endYear = startYear

    for year in range(int(startYear), int(endYear) + 1):
        files = findPatternExt(outdir, pattern=str(year), ext=".p")
        retval = [self.parseBoxOfficeMojo(ifile, debug=debug) for ifile in files]

        savename = setFile(resultsdir, str(year) + ".json")
        print("Saving", len(retval), "weekends of movie data to", savename)
        saveFile(savename, retval)
def createAlbumIDMap(self):
    """Build and save the album-ID lookup tables for name, ref and artists.

    Reads every ``-ArtistAlbums`` ``.p`` file in the albums metadata DB
    directory. Each file maps artist ID to a mapping of album ID to album
    data; element 0 of the album data is stored as the album name and
    element 1 as the album ref. Every artist an album appears under is
    collected, de-duplicated in first-seen order. The three maps are saved
    as ``Series`` named ``Album<suffix>.p`` in the Discog DB directory.
    """
    # NOTE(review): the timer label says "Artist" although this method builds
    # album maps; left unchanged because it is runtime output.
    ts = timestat("Creating Artist DBs")

    albumIDToName = {}
    albumIDToRef = {}
    albumIDToArtists = {}

    albumsMetadataDBDir = self.disc.getAlbumsMetadataDBDir()
    files = findPatternExt(albumsMetadataDBDir, pattern="-ArtistAlbums", ext='.p')
    for ifile in files:
        # File name and running total share one output line.
        print(ifile, '\t', end="")
        for artistID, artistData in getFile(ifile).items():
            for albumID, albumData in artistData.items():
                # Only the name and ref are recorded. The remaining elements
                # are not read, so an album with empty counters there no
                # longer aborts the whole run.
                albumIDToName[albumID] = albumData[0]
                albumIDToRef[albumID] = albumData[1]
                albumIDToArtists.setdefault(albumID, []).append(artistID)
        print(len(albumIDToArtists))

    print("\n\n==============================================\n")

    # De-duplicate while keeping first-seen order so the saved lists are
    # deterministic from run to run.
    for albumID, artistIDs in albumIDToArtists.items():
        albumIDToArtists[albumID] = list(dict.fromkeys(artistIDs))

    print("\n\n==============================================\n")

    savenames = {
        "IDToName": albumIDToName,
        "IDToRef": albumIDToRef,
        "IDToArtists": albumIDToArtists,
    }
    for basename, savedata in savenames.items():
        savename = setFile(self.disc.getDiscogDBDir(), "Album{0}.p".format(basename))
        print("Saving {0} entries to {1}\n".format(len(savedata), savename))
        saveFile(ifile=savename, idata=Series(savedata), debug=True)

    ts.stop()
def processWikipediaYearlyData(self, procYear=None, debug=False):
    """Parse the downloaded Wikipedia Oscar files and save one result file per year.

    Each selected file is parsed with ``parseWikipediaOscarData``; if that
    yields nothing, ``parseWikipediaOscarDataSpecial`` is tried. The winner of
    every category is printed (and the nominees when ``debug`` is true), and
    the results are saved under the file's base name, used as the year.

    Args:
        procYear: When given, only ``.p`` files in the data directory whose
            name matches ``str(procYear)`` are processed. ``None`` selects
            every ``.p`` file.
        debug: When true, print each file name and the nominees, and pass
            ``debug`` on to the fallback parser.

    Returns:
        None. Each year's results are handed to ``saveFile`` under
        ``<year>.json`` in the results directory.

    Raises:
        ValueError: If both parsers return no results for a file.
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

    for ifile in files:
        if debug:
            print("Processing {0}".format(ifile))
        year = getBaseFilename(ifile)

        # Both parsers take the file path, so the file content is not
        # loaded here.
        results = self.parseWikipediaOscarData(ifile, debug=False)
        if not results:
            results = self.parseWikipediaOscarDataSpecial(ifile, debug=debug)
        if not results:
            raise ValueError("No results for {0}".format(ifile))

        for category, entry in results.items():
            print("====>", year, '\t', category)
            print(" Winner :", entry["Winner"])
            if debug:
                print(" Nominees:", entry["Nominees"])
            print("")

        savename = setFile(self.getResultsDir(), "{0}.json".format(year))
        print("Saving {0} wikipedia oscar data to {1}".format(
            year, savename))
        saveFile(savename, results)