def netflix_decade_avg(trainingSetDir, w=sys.stdout):
    """
    Compute each customer's average rating per decade that the movie was
    created.

    Every 'mv_*.txt' file under trainingSetDir is read. The first line of a
    file holds the movie ID (followed by ':'); each remaining line is
    comma-separated with the customer ID first and the rating second.
    Ratings are accumulated as [totalRating, numRatings] per customer and
    decade, and then one section per customer is written to w:

        <custID>:
        <decade>=<average rating>

    Customers and decades are written in sorted (string) order. Decades for
    which a customer has a zero rating total are omitted. Empty movie files
    are skipped.

    Print to standard out or redirect to file using
    "> extra/movieDecadeAvgRatings.in"

    trainingSetDir is the path to training_set/ from the command line
    w is a writer (only its write() method is used)

    Raises AssertionError (when assertions are enabled) if trainingSetDir is
    empty or a rating lies outside 1.0..5.0, and KeyError if a movie ID has
    no entry in the movie/year table or its decade is not one of the
    tracked decades.
    """
    assert trainingSetDir

    # Maps movieID -> year, parsed from the precomputed titles file.
    movieIDYear = netflix_parse_precomputed('extra/movie_titles_no_nulls.txt', ',')

    def new_decade_totals():
        # Fresh {decade: [totalRating, numRatings]} accumulators for one
        # customer. The key set is fixed so that an unexpected decade still
        # surfaces as a KeyError instead of being silently accepted.
        return {
            '1890s': [0, 0],
            '1900s': [0, 0],
            '1910s': [0, 0],
            '1920s': [0, 0],
            '1930s': [0, 0],
            '1940s': [0, 0],
            '1950s': [0, 0],
            '1960s': [0, 0],
            '1970s': [0, 0],
            '1980s': [0, 0],
            '1990s': [0, 0],
            '2000s': [0, 0],
        }

    # Build dict of dict of list {custID: {decade: [totalRating, numRatings]}}
    custIDDecade = {}

    for moviePath in glob.glob(os.path.join(trainingSetDir, 'mv_*.txt')):
        with open(moviePath, 'r') as f_myfile:
            # Stream the file instead of loading every line into memory.
            headerLine = next(f_myfile, None)
            if headerLine is None:
                # Empty file: there is no movie ID and nothing to accumulate.
                continue
            movieID = headerLine.strip(':\r\n')

            # The decade is the same for every rating in this file, so it is
            # resolved once, on the first rating line. Doing it lazily keeps
            # the original behavior for files that contain only a header.
            # NOTE(review): assumes netflix_decade_calc depends only on the
            # year it is given - confirm against its definition.
            decade = None

            for custIDRatingDateLine in f_myfile:
                # get custID and actual rating
                fields = custIDRatingDateLine.strip().split(',')
                custID = fields[0]
                rating = float(fields[1])
                assert 1.0 <= rating <= 5.0

                # Only allocate accumulators for customers not seen before.
                decadeDict = custIDDecade.get(custID)
                if decadeDict is None:
                    decadeDict = new_decade_totals()
                    custIDDecade[custID] = decadeDict

                if decade is None:
                    decade = netflix_decade_calc(movieIDYear[movieID])

                # add to that decade's [totalRating, numRatings]; the list is
                # mutated in place, so no write-back into the dicts is needed
                totals = decadeDict[decade]
                totals[0] += rating
                totals[1] += 1

    # compute averages for each decade
    for custID, decadeDict in sorted(custIDDecade.items()):
        w.write(custID + ":\n")
        for decade, totals in sorted(decadeDict.items()):
            totalRating, numRating = totals
            if totalRating == 0:
                # customer didn't rate any movies of that decade
                continue
            avgRating = totalRating / numRating
            w.write(decade + "=" + str(avgRating) + "\n")
def test_parse_precomputed2(self):
    """Parse test/precomputedTest2.txt without an explicit delimiter argument and check the resulting dict."""
    path = 'test/precomputedTest2.txt'
    d = netflix_parse_precomputed(path)
    # assertEqual replaces the deprecated assert_ alias and reports a
    # key-by-key diff when the dicts differ.
    self.assertEqual(d, {'1': '1.4242', '3': '3.6908', '2': '2.4515', '5': '5.2818', '4': '4.8711'})
def test_parse_precomputed3(self):
    """Parse test/precomputedTest3.txt with ',' passed as the second argument and check the resulting dict."""
    path = 'test/precomputedTest3.txt'
    d = netflix_parse_precomputed(path, ",")
    # assertEqual replaces the deprecated assert_ alias and reports a
    # key-by-key diff when the dicts differ.
    self.assertEqual(d, {'10': '1993', '1': '2002', '3': '1903', '2': '1898', '5': '1984', '4': '1940', '7': '1934', '6': '1938', '9': '1999', '8': '1967'})
def test_parse_precomputed(self):
    """Parse test/precomputedTest.txt without an explicit delimiter argument and check the single-entry dict."""
    path = 'test/precomputedTest.txt'
    d = netflix_parse_precomputed(path)
    # assertEqual replaces the deprecated assert_ alias and reports a
    # key-by-key diff when the dicts differ.
    self.assertEqual(d, {'1': '1'})