Beispiel #1
0
def netflix_decade_avg (trainingSetDir, w = sys.stdout):
    """
    Compute customer averages per decade that the movie was created.
    Print to standard out or redirect to file using "> extra/movieDecadeAvgRatings.in"

    Every 'mv_*.txt' file in trainingSetDir is read. The first line of a file
    holds the movie ID followed by ':'; each following line is comma
    separated with the customer ID first and the rating second. Empty files
    and blank lines are skipped.

    For each customer (in sorted order of the customer ID string) a line
    "custID:" is written, followed by one "decade=average" line for every
    decade in which that customer rated at least one movie.

    trainingSetDir is the path to training_set/ from the command line
    w is a writer (any object with a write method)

    Raises KeyError if a movie ID is missing from the precomputed year table
    or if netflix_decade_calc returns a decade outside 1890s-2000s.
    """
    assert trainingSetDir
    # presumably maps movieID -> year of the movie; verify against
    # netflix_parse_precomputed
    movieIDYear = netflix_parse_precomputed('extra/movie_titles_no_nulls.txt', ',')

    # The only decades that are tracked; anything else is a KeyError below.
    decades = ('1890s', '1900s', '1910s', '1920s', '1930s', '1940s',
               '1950s', '1960s', '1970s', '1980s', '1990s', '2000s')

    # Build dict of dict of list {custID: {decade: [totalRating, numRatings]}}
    custIDDecade = {}
    for path in glob.glob(os.path.join(trainingSetDir, 'mv_*.txt')) :
        with open(path, 'r') as f_myfile:
            header = f_myfile.readline()
            if not header : # completely empty file, nothing to aggregate
                continue
            movieID = header.strip(':\r\n')

            # Every rating in this file belongs to the same movie, so the
            # decade is resolved once, on the first rating line. It is done
            # lazily so a file without ratings never triggers a lookup.
            decade = None

            # Stream the file line by line instead of loading it whole.
            for custIDRatingDateLine in f_myfile :
                stripped = custIDRatingDateLine.strip()
                if not stripped : # tolerate blank / trailing empty lines
                    continue

                #get custID and actual rating
                custIDRatingDateList = stripped.split(',')
                custID = custIDRatingDateList[0]
                rating = float(custIDRatingDateList[1])
                assert 1.0 <= rating <= 5.0

                if decade is None :
                    decade = netflix_decade_calc(movieIDYear[movieID])

                # Allocate the per-decade accumulators only the first time
                # a customer is seen.
                decadeDict = custIDDecade.get(custID)
                if decadeDict is None :
                    decadeDict = dict((d, [0, 0]) for d in decades)
                    custIDDecade[custID] = decadeDict

                #add to that decade's [totalRating, numRatings]
                # (the list is mutated in place, no write-back needed)
                totalRatingNumRatingList = decadeDict[decade]
                totalRatingNumRatingList[0] += rating #totalRating
                totalRatingNumRatingList[1] += 1      #numRatings

    # compute averages for each decade
    for custID in sorted(custIDDecade) :
        w.write( custID + ":\n" )
        decadeDict = custIDDecade[custID]
        for decade in sorted(decadeDict) :
            totalRating, numRatings = decadeDict[decade]
            # Guard on the divisor itself: customer didn't rate any movies
            # of that decade.
            if numRatings == 0 :
                continue
            avgRating = totalRating / numRatings
            w.write( decade + "=" + str(avgRating) + "\n")
Beispiel #2
0
 def test_decade2(self):
     """netflix_decade_calc is expected to map the year string '2005' to '2000s'."""
     year = '2005'
     # assertEqual shows both values on failure; the assert_ alias is
     # deprecated and no longer exists in Python 3.12+.
     self.assertEqual(netflix_decade_calc(year), '2000s')
Beispiel #3
0
 def test_decade3(self):
     """netflix_decade_calc is expected to map the year string '1989' to '1980s'."""
     year = '1989'
     # assertEqual shows both values on failure; the assert_ alias is
     # deprecated and no longer exists in Python 3.12+.
     self.assertEqual(netflix_decade_calc(year), '1980s')
Beispiel #4
0
 def test_decade(self):
     """netflix_decade_calc is expected to map the year string '1890' to '1890s'."""
     year = '1890'
     # assertEqual shows both values on failure; the assert_ alias is
     # deprecated and no longer exists in Python 3.12+.
     self.assertEqual(netflix_decade_calc(year), '1890s')