コード例 #1
0
ファイル: movies.py プロジェクト: tgadf/movies
def correctOscarData():
    print "Checking for unparsed oscar data."
    backupfilename = setFile(getWikipediaDir(), "oscars.yaml.backup")    
    filename = setFile(getWikipediaDir(), "oscars.yaml")
    copyFile(filename, backupfilename)
    data     = get(filename)
    #fixes    = {}
    for year,ydata in data.iteritems():
        print "\n==>",year
        for cat,catdata in ydata.iteritems():
            
            winner = catdata["Winner"]
            if isinstance(winner, list):
                if winner[0].find(",") != -1:
                    print "\t",cat,"\t",winner[0]

            nominees = catdata["Nominees"]
            for nominee in nominees:
                if isinstance(nominee, list):
                    if nominee[0].find(",") != -1:
                        print "\t",cat,"\t",nominee[0]


    savename = setFile(getOscarDir(), "oscars.yaml")
    print "Saving",len(data),"yearly results to",savename
    save(savename, data)
コード例 #2
0
ファイル: movies.py プロジェクト: tgadf/movies
def getOscarData():    
    filename   = setFile(getOscarDir(), "oscars.yaml")
    data       = get(filename)
    yearlyData = {}
    for year,ydata in data.iteritems():
        
        movies = {}
        for category,categorydata in ydata.iteritems():
            if category.find("Song") != -1:
                continue
            sf = 1
            if category.find("Song") != -1:
                sf = 0
            elif category.find("Picture") != -1:
                sf = 40
            elif category.find("Animated Feature") != -1:
                sf = 35
            elif category.find("Director") != -1:
                sf = 30
            elif category.find("Actor") != -1 or category.find("Actress") != -1:
                sf = 25
            elif category.find("Screenplay") != -1:
                sf = 20
            winner = categorydata.get("Winner")
            if winner:
                #print category,'\t',winner
                if isinstance(winner, list):                    
                    movie = winner[0]
                else:
                    movie = winner
                    
                #print category,'\t',10*sf,'\t',winner
                if movies.get(movie) == None:
                    movies[movie] = 10*sf
                else:
                    movies[movie] = max(10*sf, movies[movie])
        
            nominees = categorydata.get("Nominees")
            if nominees:
                for nominee in nominees:
                    if isinstance(nominee, list):
                        movie = nominee[0]
                    else:
                        movie = nominee
                    
                    #print category,'\t',sf,'\t',winner
                    if movies.get(movie) == None:
                        movies[movie] = sf
                    else:
                        movies[movie] = max(sf, movies[movie])
        
        yearlyData[year] = sorted(movies.items(), key=operator.itemgetter(1), reverse=True)
        print "---->",year,"<----"
        for item in yearlyData[year][:15]:
            print item
        print '\n'
        
    savename = setFile(getOscarDir(), "oscars.json")
    print "Saving",len(yearlyData),"yearly results to",savename
    save(savename, yearlyData)
コード例 #3
0
ファイル: movies.py プロジェクト: tgadf/movies
def parseWikipediaOscarData1985(debug = True):
    results  = {}

    filename = setSubFile(getWikipediaDir(), "1985", "1985.dat")
    data     = get(filename)
    title    = None
    for line in data:
        if len(line) > 0 and title == None:
            title = line.replace("\t", "")
            title = title.strip()
            results[title] = {}
            continue
        if len(line) == 0:
            title = None
            continue
            if debug:
                print "      Winner  :",data[title]["Winner"]
                print "      Nominees:",data[title]["Nominees"]
                print ""               
        line = line.replace("\xe2\x80\x93", "::")
        vals = line.split(" :: ")
        vals = reorderWikipediaOscarData(vals, title)
        reorders = ["Best Director", "Best Actress", "Best Actor", 
                    "Best Supporting Actor", "Best Supporting Actress"]
        if title in reorders:
            vals[0] = vals[0].split(" as ")[0]

        if results[title].get("Winner") == None:
            results[title]["Winner"] = vals[0]
        else:
            if results[title].get("Nominees") == None:
                results[title]["Nominees"] = []
            results[title]["Nominees"].append(vals[0])

    return results            
コード例 #4
0
ファイル: movies.py プロジェクト: tgadf/movies
def processWikipediaOscarFiles(procYear = None):
    outdir = getWikipediaDir()
    if procYear == None:
        files = findSubExt(outdir, "data", ext=".p")
        #files = glob(join(outdir, "data", "*.p"))
    else:
        files = findSubPatternExt(outdir, "data", pattern=str(procYear), ext=".p")
        #files = glob(join(outdir, "data", str(procYear)+".p"))

    movies = OrderedDict()    
    for ifile in files:
        print ifile
        year    = getBaseFilename(ifile)
        print year
        #if year == "1985": continue
        htmldata = get(ifile)
        bsdata   = getHTML(htmldata)
        if int(year) <= 1984:
            results = parseWikipediaOscarDataPre1985(bsdata, True)
        elif int(year) >= 1986:
            results = parseWikipediaOscarDataPost1987(bsdata, True)
        else:
            results = parseWikipediaOscarData1985(debug = True)
        movies[year] = results
        for k,v in results.iteritems():
            print "====>",year,'\t',k
            print "      Winner  :",results[k]["Winner"]
            print "      Nominees:",results[k]["Nominees"]
            print ""

    savename = setFile(outdir, "oscars.yaml")
    print "Saving",len(movies),"years of wikipedia oscar data to",savename
    save(savename, movies)
コード例 #5
0
ファイル: movies.py プロジェクト: tgadf/movies
def processBoxOfficeMojo():
    outdir   = getBoxOfficeDir()
    savename = setFile(outdir, "results.json")
    
    data = get(savename)
    movies = {}
    yearlyData = {}
    for i,year in enumerate(data.keys()):
        movies[year] = {}
        ydata = data[year]
        for wdata in ydata:
            for mdata in wdata:
                movie  = mdata[2]
                retval = search("\((\d+)\)",movie)
                if retval:
                    stryear  = retval.group()
                    movie = movie.replace(stryear, "").strip()

                gross  = convertCurrency(mdata[9])
                weekly = convertCurrency(mdata[4])
                money  = max(gross, weekly)
                if movies[year].get(movie) == None:
                    movies[year][movie] = money
                else:                    
                    movies[year][movie] = max(money, movies[year][movie])

        yearlyData[year] = sorted(movies[year].items(), key=operator.itemgetter(1), reverse=True)
        print "---->",year,"<----"
        for item in yearlyData[year][:25]:
            print item
        print '\n'
        
    savename = setFile(outdir, "boxofficemojo.json")
    print "Saving",len(yearlyData),"yearly results to",savename
    save(savename, yearlyData)
コード例 #6
0
ファイル: movies.py プロジェクト: tgadf/movies
def parseBoxOfficeMojo(ifile):
    htmldata = get(ifile)
    bsdata   = getHTML(htmldata)
    tbl = None
    for table in bsdata.findAll("table"):
        if tbl:
            break
        for tr in table.findAll("tr"):
            if len(tr) >= 10:
                tbl = table
                break
            else:
                break
        
    #print len(tbl)
    keys = []
    data = []
    for i,tr in enumerate(tbl):
        vals = []
        if i == 0:
            for j,td in enumerate(tr.findAll("td")):
                for ref in td.findAll("a"):
                    key = ref.string
                    keys.append(key)
        else:
            if len(tr) <= 1: continue
            #print "\n\n\nNext...."
            #print tr
            #print "  tr-->",tr,'\t',len(tr)
            #print i,tr,len(data)
            for j,td in enumerate(tr.findAll("td")):
                if td.string == None:
                    continue
                try:
                    if search("TOTAL \((\d+) MOVIES\)", td.string):
                        break
                except:
                    print j,td.string
                    raise()
                key = keys[j]
                val = td.string
                vals.append(val)
                #print j,'\t',keys[j],'\t',td.string
            if len(vals) == 0: break
            if len(vals) != len(keys):
                print "Mismatch with keys/data"
                print len(keys),'\t',keys
                print len(vals),'\t',vals
                break
            else:
                data.append(vals)

    
    print "Found",len(data),"movies from",ifile            
    return data
コード例 #7
0
ファイル: movies.py プロジェクト: tgadf/movies
def mergeBoxOfficeMojoResults():
    outdir = getBoxOfficeDir()
    retval = {}
    files  = findSubExt(outdir, "results", ext=".json")
    for ifile in files:
        year = getBaseFilename(ifile)
        data = get(ifile)
        retval[year] = data
              
    savename = setFile(outdir, "results.json")
    print "Saving",len(retval),"years of movie data to",savename
    save(savename, retval)
コード例 #8
0
def loadConfig():
    """Load and return the contents of the pymva config.yaml file."""
    path = setFile("/Users/tgadfort/Documents/pymva", "config.yaml")
    info("Importing [{0}]".format(path), ind=0)
    return get(path)
コード例 #9
0
ファイル: movies.py プロジェクト: tgadf/movies
def combineMovies(minOscarVal = 10, minRottenVal = 100, minBoxOfficeVal = 20e6,
                  keepIMAX = False, debug = False):
    outdir             = getMovieDir()
    
    oscarsFile         = setSubFile(outdir, "oscars", "oscars.json")
    boxofficeFile      = setSubFile(outdir, "boxoffice.com", "boxofficemojo.json")
    rottentomatoesFile = setSubFile(outdir, "rottentomatoes", "rottentomatoes.json")
    print oscarsFile
    print boxofficeFile
    print rottentomatoesFile
    
    oscarData          = get(oscarsFile)
    boxofficeData      = get(boxofficeFile)
    rottentomatoesData = get(rottentomatoesFile)
    
    movieCounter = {}    
    
    yearlyMovies = OrderedDict()
    years = sorted(list(set(oscarData.keys() + boxofficeData.keys() + rottentomatoesData.keys())))
    for year in years:        
        
        oscarMovies          = oscarData.get(year)
        boxofficeMovies      = boxofficeData.get(year)
        rottentomatoesMovies = rottentomatoesData.get(year)
        
        if oscarMovies:
            if debug: print year,'\t','Oscars         ','\t',len(oscarMovies),'\t',
            oscarMovies = [x[0] for x in oscarMovies if x[1] >= minOscarVal]
            if debug: print len(oscarMovies)
        else:
            oscarMovies = []
        
        if boxofficeMovies:
            if debug: print year,'\t','Box Office     ','\t',len(boxofficeMovies),'\t',
            boxofficeMovies = [x[0] for x in boxofficeMovies if x[1] >= minBoxOfficeVal]
            if debug: print len(boxofficeMovies)
        else:
            boxofficeMovies = []
            
        if rottentomatoesMovies:
            if debug: print year,'\t','Rotten Tomatoes','\t',len(rottentomatoesMovies),'\t',
            rottentomatoesMovies = [x[0] for x in rottentomatoesMovies if x[1] >= minRottenVal]
            if debug: print len(rottentomatoesMovies)
        else:
            rottentomatoesMovies = []    
                
        movies = OrderedDict()
        for movie in oscarMovies:
            movie = manualRenames(movie, year, keepIMAX)
            if movie:
                movies[movie] = "Oscar"
            
        for movie in boxofficeMovies:
            movie = manualRenames(movie, year, keepIMAX)
            if movie:
                if movies.get(movie): continue
                movies[movie] = "Box Office"
            
        for movie in rottentomatoesMovies:
            if movie:
                movie = manualRenames(movie, year, keepIMAX)
                if movies.get(movie): continue
                movies[movie] = "Rotten Tomatoes"

        for movie in movies.keys():
            uyear = unicode(str(int(year)), 'utf-8')
            testmovie = movie + u" ["+uyear+u"]"
            if movieCounter.get(testmovie):
                print "Removing",movie,"[",year,"] --->",movieCounter[testmovie]
                del movies[movie]
                continue

            stop = False
            for dyear in [1, -1, 2, -2]:
                uyear = unicode(str(int(year)+dyear), 'utf-8')
                testmovie2 = movie + u" ["+uyear+u"]"
                if movieCounter.get(testmovie2):
                    print "Removing",movie,"[",year,"] --->",movieCounter[testmovie2]
                    del movies[movie]
                    stop = True
                    break
            if stop:
                continue

            movieCounter[testmovie] = testmovie #movies[movie]
            
            
        yearlyMovies[year] = movies

        if debug: 
            print year,'\t','   ----->      ','\t\t',len(movies.keys())
            for movie,mtype in movies.iteritems():
                print      '\t','\t',movie,mtype

    for year in yearlyMovies.keys():
        for movie in yearlyMovies[year].keys():
            if movie.find(": ") != -1:
                print "    if movie == \""+movie+"\":"
                print "        return u\""+movie.replace(": ", " ")+"\""

                
    mergeYearlyMovies(outdir, yearlyMovies)
コード例 #10
0
ファイル: movies.py プロジェクト: tgadf/movies
def _strOrUnicode(text):
    """Return str(text), or text unchanged when it is not plain ASCII."""
    try:
        return str(text)
    except UnicodeError:
        return text


def processSundanceData():
    """Parse the saved Sundance award pages and save the winners as winners.yaml.

    For every ".p" file in the "data" sub-directory of the Sundance
    directory, the <h2> headings whose <span> text is an integer give the
    years, and the n-th <ul> of the page is taken as the award list of the
    n-th year.  Each <li> is split into "<category> :: <movie>" at a dash or
    left single quote, the movie text is reduced to the film title for a
    number of person-oriented categories, and the result is stored as
    data[year][category] = movie.

    Raises:
        ValueError: when a list item does not split into category and movie.
    """
    # Categories whose entry is trimmed to the part after " for ".
    forCategories = ("World Cinema Dramatic Screenwriting Award",
                     "Sundance Institute/Mahindra Global Filmmaking Awards",
                     "World Cinema Documentary Editing Award",
                     "Excellence in Cinematography Award: Documentary",
                     "Excellence in Cinematography Award: Dramatic",
                     "World Cinema Cinematography Award: Documentary",
                     "World Cinema Cinematography Award: Dramatic",
                     "World Cinema Directing Award: Dramatic",
                     "World Cinema Directing Award: Documentary",
                     "World Dramatic Special Jury Prizes for Breakout Performances",
                     "Dramatic Special Jury Prize for Breakout Performance",
                     "Excellence in Cinematography Award Dramatic",
                     "Excellence in Cinematography Award Documentary",
                     # Original misspelled entry, kept in case the data matches it.
                     "xcellence in Cinematography Award Documentary",
                     "Documentary Editing Award",
                     "Waldo Salt Screenwriting Award: Dramatic",
                     "World Cinema Screenwriting Award",
                     "Directing Award Documentary",
                     "Directing Award Dramatic")

    files  = findSubExt(getSundanceDir(), "data", ext=".p")
    data   = OrderedDict()
    for ifile in files:
        htmldata = get(ifile)
        bsdata   = getHTML(htmldata)
        years    = []
        for h2 in bsdata.findAll("h2"):
            span = h2.find("span")
            try:
                year = int(span.string)
            except (AttributeError, TypeError, ValueError):
                # No span, no text, or not a year heading.
                continue
            years.append(year)

        for j,ul in enumerate(bsdata.findAll("ul")):
            if j >= len(years):
                # More lists than year headings: the rest is not award data.
                break
            year = years[j]
            data[year] = {}
            for li in ul.findAll("li"):
                try:
                    txt    = li.text
                    txt    = re.sub("\xe2\x80\x93", " :: ", txt)
                    txt    = re.sub(u"(\u2018|\u2013)", " :: ", txt)
                except Exception:
                    print "Error with",li
                    continue

                vals = txt.split(" :: ")
                if len(vals) > 2:
                    # Extra separators belong to the movie text.
                    vals[1] = "-".join(vals[1:])
                    vals = vals[:2]
                vals = [x.strip() for x in vals]
                if len(vals) != 2:
                    raise ValueError(vals)

                cat   = vals[0]
                movie = vals[1]

                if cat.find("Piper-Heidsieck") != -1:
                    continue

                if cat.find("Alfred P. Sloan") != -1:
                    cat = "Alfred P. Sloan Prize"

                if cat in forCategories:
                    vals = movie.split(" for ")
                    if len(vals) == 2:
                        movie = vals[1]
                    elif len(vals) == 1:
                        movie = vals[0]
                    else:
                        print "Error in",cat,"===>",movie
                        continue

                if cat in ["Special Jury Prize for Acting"]:
                    movie = movie.replace("for her performance ", "")
                    vals = movie.split(" in ")
                    if len(vals) == 2:
                        movie = vals[1]
                    vals = movie.split(" for ")
                    if len(vals) == 2:
                        movie = vals[1]

                if movie.find("retitled") != -1:
                    # Keep the new title and drop its final character.
                    movie = movie.split("retitled ")[1]
                    movie = movie[:-1]

                movie = movie.replace(" (tie)", "")

                if movie.find(" director of ") != -1:
                    movie = movie.split(" director of ")[1]

                print years[j],'\t',cat,'\t\t',movie,'\t\t'
                # Store plain str where possible; a non-ASCII category or
                # movie is kept as unicode instead of raising.
                key = _strOrUnicode(cat)
                try:
                    data[year][key] = str(movie)
                except UnicodeError:
                    data[year][key] = movie

    savename = setFile(getSundanceDir(), "winners.yaml")
    print "Saving",len(data),"yearly results to",savename
    save(savename, data)