def predict(movie_id, nbmodel, imdb_db):
    """Predict the class labels for user-rating and gross/budget multiplier.

    Hydrates the movie's features from the database, sums the per-feature
    weights stored in ``nbmodel`` for every feature id the movie has, adds
    the class priors, and picks the class (bin) with the highest posterior
    score for each of the two targets.

    Args:
        movie_id: identifier passed through to ``hydrate``.
        nbmodel: trained Naive Bayes model -- maps each feature name in
            ``FEATURES`` to ``{feature_id: {bin: weight}}``, plus
            ``'rating'`` / ``'bmult'`` prior tables keyed by bin.
        imdb_db: database handle passed through to ``hydrate``.

    Returns:
        Tuple ``([true_rating, pred_rating], [true_bmult, pred_bmult])``.
    """
    # Generate custom movie dict by ingesting features from the database.
    movie = hydrate(movie_id, imdb_db, MAX_ACTORS)
    true_rating = movie['rating']
    true_bmult = movie['bmult']

    # Initialize every bin's posterior to 0 up front so the accumulation
    # loop needs no KeyError handling.
    pos_rating = dict.fromkeys(BINS_RATING, 0)
    pos_bmult = dict.fromkeys(BINS_BMULT, 0)

    for feat in FEATURES:
        table = nbmodel[feat]
        for id_ in movie[feat]:
            # BUGFIX: the original built a string and eval()'d it just to
            # perform this lookup -- needless and unsafe. Plain dict access
            # does the same thing. Also, the original used setdefault(),
            # which MUTATED the trained model during prediction (inserting
            # empty entries for unseen feature ids); .get() is read-only.
            weights = table.get(id_, {})
            for br in BINS_RATING:
                pos_rating[br] += weights.get(br, 0)
            for bm in BINS_BMULT:
                pos_bmult[bm] += weights.get(bm, 0)

    # Add class priors.
    for br in BINS_RATING:
        pos_rating[br] += nbmodel['rating'][br]
    for bm in BINS_BMULT:
        pos_bmult[bm] += nbmodel['bmult'][bm]

    # Predicted class = bin with the highest posterior score.
    # (.items() replaces Python-2-only .iteritems(); works on 2 and 3.)
    pred_rating = max(pos_rating.items(), key=operator.itemgetter(1))[0]
    pred_bmult = max(pos_bmult.items(), key=operator.itemgetter(1))[0]

    return ([true_rating, pred_rating], [true_bmult, pred_bmult])
sys.stdout.write('Loading imdb.db... ') sys.stdout.flush() ia = imdb.IMDb('sql', uri=db_uri) sys.stdout.write('[done]\n') # all pruning will be done in movielist mlist = open(MOVIE_FILE, 'r') mov_id = mlist.readline().strip() max_budget = 0 # for normalization purposes while mov_id != '': sys.stdout.write('Reading movie #' + mov_id + ': ') sys.stdout.flush() movie = hydrate(mov_id, ia, MAX_ACTORS) sys.stdout.write(movie['title']) sys.stdout.flush() # initialize feature vector current_fv = [0]*FV_LENGTH # generate output labels rating_labels.append(BINS_RATING.index(movie['rating'])) bmult_labels.append(BINS_BMULT.index(movie['bmult'])) # Populate feature vector ''' for actor_id in iter(movie['actor']): current_fv[PERSON_OFFSET + person_fvid[actor_id]] = 1