def correctOscarData(): print "Checking for unparsed oscar data." backupfilename = setFile(getWikipediaDir(), "oscars.yaml.backup") filename = setFile(getWikipediaDir(), "oscars.yaml") copyFile(filename, backupfilename) data = get(filename) #fixes = {} for year,ydata in data.iteritems(): print "\n==>",year for cat,catdata in ydata.iteritems(): winner = catdata["Winner"] if isinstance(winner, list): if winner[0].find(",") != -1: print "\t",cat,"\t",winner[0] nominees = catdata["Nominees"] for nominee in nominees: if isinstance(nominee, list): if nominee[0].find(",") != -1: print "\t",cat,"\t",nominee[0] savename = setFile(getOscarDir(), "oscars.yaml") print "Saving",len(data),"yearly results to",savename save(savename, data)
def getOscarData(): filename = setFile(getOscarDir(), "oscars.yaml") data = get(filename) yearlyData = {} for year,ydata in data.iteritems(): movies = {} for category,categorydata in ydata.iteritems(): if category.find("Song") != -1: continue sf = 1 if category.find("Song") != -1: sf = 0 elif category.find("Picture") != -1: sf = 40 elif category.find("Animated Feature") != -1: sf = 35 elif category.find("Director") != -1: sf = 30 elif category.find("Actor") != -1 or category.find("Actress") != -1: sf = 25 elif category.find("Screenplay") != -1: sf = 20 winner = categorydata.get("Winner") if winner: #print category,'\t',winner if isinstance(winner, list): movie = winner[0] else: movie = winner #print category,'\t',10*sf,'\t',winner if movies.get(movie) == None: movies[movie] = 10*sf else: movies[movie] = max(10*sf, movies[movie]) nominees = categorydata.get("Nominees") if nominees: for nominee in nominees: if isinstance(nominee, list): movie = nominee[0] else: movie = nominee #print category,'\t',sf,'\t',winner if movies.get(movie) == None: movies[movie] = sf else: movies[movie] = max(sf, movies[movie]) yearlyData[year] = sorted(movies.items(), key=operator.itemgetter(1), reverse=True) print "---->",year,"<----" for item in yearlyData[year][:15]: print item print '\n' savename = setFile(getOscarDir(), "oscars.json") print "Saving",len(yearlyData),"yearly results to",savename save(savename, yearlyData)
def parseWikipediaOscarData1985(debug = True): results = {} filename = setSubFile(getWikipediaDir(), "1985", "1985.dat") data = get(filename) title = None for line in data: if len(line) > 0 and title == None: title = line.replace("\t", "") title = title.strip() results[title] = {} continue if len(line) == 0: title = None continue if debug: print " Winner :",data[title]["Winner"] print " Nominees:",data[title]["Nominees"] print "" line = line.replace("\xe2\x80\x93", "::") vals = line.split(" :: ") vals = reorderWikipediaOscarData(vals, title) reorders = ["Best Director", "Best Actress", "Best Actor", "Best Supporting Actor", "Best Supporting Actress"] if title in reorders: vals[0] = vals[0].split(" as ")[0] if results[title].get("Winner") == None: results[title]["Winner"] = vals[0] else: if results[title].get("Nominees") == None: results[title]["Nominees"] = [] results[title]["Nominees"].append(vals[0]) return results
def processWikipediaOscarFiles(procYear = None): outdir = getWikipediaDir() if procYear == None: files = findSubExt(outdir, "data", ext=".p") #files = glob(join(outdir, "data", "*.p")) else: files = findSubPatternExt(outdir, "data", pattern=str(procYear), ext=".p") #files = glob(join(outdir, "data", str(procYear)+".p")) movies = OrderedDict() for ifile in files: print ifile year = getBaseFilename(ifile) print year #if year == "1985": continue htmldata = get(ifile) bsdata = getHTML(htmldata) if int(year) <= 1984: results = parseWikipediaOscarDataPre1985(bsdata, True) elif int(year) >= 1986: results = parseWikipediaOscarDataPost1987(bsdata, True) else: results = parseWikipediaOscarData1985(debug = True) movies[year] = results for k,v in results.iteritems(): print "====>",year,'\t',k print " Winner :",results[k]["Winner"] print " Nominees:",results[k]["Nominees"] print "" savename = setFile(outdir, "oscars.yaml") print "Saving",len(movies),"years of wikipedia oscar data to",savename save(savename, movies)
def processBoxOfficeMojo(): outdir = getBoxOfficeDir() savename = setFile(outdir, "results.json") data = get(savename) movies = {} yearlyData = {} for i,year in enumerate(data.keys()): movies[year] = {} ydata = data[year] for wdata in ydata: for mdata in wdata: movie = mdata[2] retval = search("\((\d+)\)",movie) if retval: stryear = retval.group() movie = movie.replace(stryear, "").strip() gross = convertCurrency(mdata[9]) weekly = convertCurrency(mdata[4]) money = max(gross, weekly) if movies[year].get(movie) == None: movies[year][movie] = money else: movies[year][movie] = max(money, movies[year][movie]) yearlyData[year] = sorted(movies[year].items(), key=operator.itemgetter(1), reverse=True) print "---->",year,"<----" for item in yearlyData[year][:25]: print item print '\n' savename = setFile(outdir, "boxofficemojo.json") print "Saving",len(yearlyData),"yearly results to",savename save(savename, yearlyData)
def parseBoxOfficeMojo(ifile): htmldata = get(ifile) bsdata = getHTML(htmldata) tbl = None for table in bsdata.findAll("table"): if tbl: break for tr in table.findAll("tr"): if len(tr) >= 10: tbl = table break else: break #print len(tbl) keys = [] data = [] for i,tr in enumerate(tbl): vals = [] if i == 0: for j,td in enumerate(tr.findAll("td")): for ref in td.findAll("a"): key = ref.string keys.append(key) else: if len(tr) <= 1: continue #print "\n\n\nNext...." #print tr #print " tr-->",tr,'\t',len(tr) #print i,tr,len(data) for j,td in enumerate(tr.findAll("td")): if td.string == None: continue try: if search("TOTAL \((\d+) MOVIES\)", td.string): break except: print j,td.string raise() key = keys[j] val = td.string vals.append(val) #print j,'\t',keys[j],'\t',td.string if len(vals) == 0: break if len(vals) != len(keys): print "Mismatch with keys/data" print len(keys),'\t',keys print len(vals),'\t',vals break else: data.append(vals) print "Found",len(data),"movies from",ifile return data
def mergeBoxOfficeMojoResults(): outdir = getBoxOfficeDir() retval = {} files = findSubExt(outdir, "results", ext=".json") for ifile in files: year = getBaseFilename(ifile) data = get(ifile) retval[year] = data savename = setFile(outdir, "results.json") print "Saving",len(retval),"years of movie data to",savename save(savename, retval)
def loadConfig(configdir = "/Users/tgadfort/Documents/pymva"):
    """Load and return the contents of ``config.yaml``.

    configdir -- directory holding ``config.yaml``.  Defaults to the
                 previously hard-coded location, so existing callers are
                 unaffected while other machines can pass their own path.

    Returns whatever get() yields for the config file.
    """
    configname = setFile(configdir, "config.yaml")
    info("Importing [{0}]".format(configname), ind=0)
    return get(configname)
def combineMovies(minOscarVal = 10, minRottenVal = 100, minBoxOfficeVal = 20e6, keepIMAX = False, debug = False): outdir = getMovieDir() oscarsFile = setSubFile(outdir, "oscars", "oscars.json") boxofficeFile = setSubFile(outdir, "boxoffice.com", "boxofficemojo.json") rottentomatoesFile = setSubFile(outdir, "rottentomatoes", "rottentomatoes.json") print oscarsFile print boxofficeFile print rottentomatoesFile oscarData = get(oscarsFile) boxofficeData = get(boxofficeFile) rottentomatoesData = get(rottentomatoesFile) movieCounter = {} yearlyMovies = OrderedDict() years = sorted(list(set(oscarData.keys() + boxofficeData.keys() + rottentomatoesData.keys()))) for year in years: oscarMovies = oscarData.get(year) boxofficeMovies = boxofficeData.get(year) rottentomatoesMovies = rottentomatoesData.get(year) if oscarMovies: if debug: print year,'\t','Oscars ','\t',len(oscarMovies),'\t', oscarMovies = [x[0] for x in oscarMovies if x[1] >= minOscarVal] if debug: print len(oscarMovies) else: oscarMovies = [] if boxofficeMovies: if debug: print year,'\t','Box Office ','\t',len(boxofficeMovies),'\t', boxofficeMovies = [x[0] for x in boxofficeMovies if x[1] >= minBoxOfficeVal] if debug: print len(boxofficeMovies) else: boxofficeMovies = [] if rottentomatoesMovies: if debug: print year,'\t','Rotten Tomatoes','\t',len(rottentomatoesMovies),'\t', rottentomatoesMovies = [x[0] for x in rottentomatoesMovies if x[1] >= minRottenVal] if debug: print len(rottentomatoesMovies) else: rottentomatoesMovies = [] movies = OrderedDict() for movie in oscarMovies: movie = manualRenames(movie, year, keepIMAX) if movie: movies[movie] = "Oscar" for movie in boxofficeMovies: movie = manualRenames(movie, year, keepIMAX) if movie: if movies.get(movie): continue movies[movie] = "Box Office" for movie in rottentomatoesMovies: if movie: movie = manualRenames(movie, year, keepIMAX) if movies.get(movie): continue movies[movie] = "Rotten Tomatoes" for movie in movies.keys(): uyear = unicode(str(int(year)), 
'utf-8') testmovie = movie + u" ["+uyear+u"]" if movieCounter.get(testmovie): print "Removing",movie,"[",year,"] --->",movieCounter[testmovie] del movies[movie] continue stop = False for dyear in [1, -1, 2, -2]: uyear = unicode(str(int(year)+dyear), 'utf-8') testmovie2 = movie + u" ["+uyear+u"]" if movieCounter.get(testmovie2): print "Removing",movie,"[",year,"] --->",movieCounter[testmovie2] del movies[movie] stop = True break if stop: continue movieCounter[testmovie] = testmovie #movies[movie] yearlyMovies[year] = movies if debug: print year,'\t',' -----> ','\t\t',len(movies.keys()) for movie,mtype in movies.iteritems(): print '\t','\t',movie,mtype for year in yearlyMovies.keys(): for movie in yearlyMovies[year].keys(): if movie.find(": ") != -1: print " if movie == \""+movie+"\":" print " return u\""+movie.replace(": ", " ")+"\"" mergeYearlyMovies(outdir, yearlyMovies)
def processSundanceData(): files = findSubExt(getSundanceDir(), "data", ext=".p") data = OrderedDict() for ifile in files: htmldata = get(ifile) bsdata = getHTML(htmldata) years = [] for h2 in bsdata.findAll("h2"): span = h2.find("span") try: year = int(span.string) except: continue years.append(year) for j,ul in enumerate(bsdata.findAll("ul")): try: year = years[j] except: break data[year] = {} lis = ul.findAll("li") for li in lis: try: txt = li.text txt = re.sub("\xe2\x80\x93", " :: ", txt) txt = re.sub(u"(\u2018|\u2013)", " :: ", txt) except: print "Error with",li continue vals = txt.split(" :: ") if len(vals) > 2: vals[1] = "-".join(vals[1:]) vals = vals[:2] vals = [x.strip() for x in vals] if len(vals) != 2: raise ValueError(vals) cat = vals[0] movie = vals[1] if cat.find("Piper-Heidsieck") != -1: continue if cat.find("Alfred P. Sloan") != -1: cat = "Alfred P. Sloan Prize" if cat in ["World Cinema Dramatic Screenwriting Award", "Sundance Institute/Mahindra Global Filmmaking Awards", "World Cinema Documentary Editing Award", "Excellence in Cinematography Award: Documentary", "Excellence in Cinematography Award: Dramatic", "World Cinema Cinematography Award: Documentary", "World Cinema Cinematography Award: Dramatic", "World Cinema Directing Award: Dramatic", "World Cinema Directing Award: Documentary", "World Dramatic Special Jury Prizes for Breakout Performances", "Dramatic Special Jury Prize for Breakout Performance", "Excellence in Cinematography Award Dramatic", "xcellence in Cinematography Award Documentary", "Documentary Editing Award", "Waldo Salt Screenwriting Award: Dramatic", "World Cinema Screenwriting Award", "Directing Award Documentary", "Directing Award Dramatic"]: vals = movie.split(" for ") if len(vals) == 2: movie = vals[1] elif len(vals) == 1: movie = vals[0] else: print "Error in",cat,"===>",movie continue if cat in ["Special Jury Prize for Acting"]: movie = movie.replace("for her performance ", "") vals = movie.split(" in ") if len(vals) == 
2: movie = vals[1] vals = movie.split(" for ") if len(vals) == 2: movie = vals[1] if movie.find("retitled") != -1: movie = movie.split("retitled ")[1] movie = movie[:-1] movie = movie.replace(" (tie)", "") if movie.find(" director of ") != -1: movie = movie.split(" director of ")[1] print years[j],'\t',cat,'\t\t',movie,'\t\t' try: data[year][str(cat)] = str(movie) except: data[year][str(cat)] = movie savename = setFile(getSundanceDir(), "winners.yaml") print "Saving",len(data),"yearly results to",savename save(savename, data)