def predictTest(self, cursor, userId, clf): cursor.execute( ''' SELECT TestRatings.movieId, MovieYearGenres.year, genreBits, {0} FROM TestRatings join MovieYearGenres on TestRatings.movieId=MovieYearGenres.id join MovieTags on TestRatings.movieId=MovieTags.id where TestRatings.userId=?'''.format(','.join( ['tagBits' + str(i) for i in range(self.tagBitsCount)])), (userId, )) testingData = [ list(row[0:2]) + list(bitstring.Bits(int=row[2], length=len(self.ALL_GENRES))) + flatNestList( [list(bitstring.Bits(int=b, length=32)) for b in row[3:]]) for row in cursor.fetchall() ] testingData = np.array(testingData, dtype='int32') predictY = clf.predict(testingData[:, 1:]) toDB = predictY[:, None] toDB = np.insert(toDB, 1, userId, axis=1) toDB = np.insert(toDB, 2, testingData[:, 0], axis=1) cursor.executemany( 'update TestRatings set predict=? where userId=? and movieId=?', toDB.tolist())
def trainClassifier(self, cursor, userId, clf): cursor.execute( ''' SELECT Ratings.rating, MovieYearGenres.year, genreBits, {0} FROM Ratings join MovieYearGenres on Ratings.movieId=MovieYearGenres.id join MovieTags on Ratings.movieId=MovieTags.id where Ratings.userId=?'''.format(','.join( ['tagBits' + str(i) for i in range(self.tagBitsCount)])), (userId, )) trainingData = [ list(row[0:2]) + list(bitstring.Bits(int=row[2], length=len(self.ALL_GENRES))) + flatNestList( [list(bitstring.Bits(int=b, length=32)) for b in row[3:]]) for row in cursor.fetchall() ] trainingData = np.array(trainingData, dtype='int32') if len(trainingData) == 0: raise Exception( 'User {0} does not appear in training set.'.format(userId)) y = trainingData[:, 0] X = trainingData[:, 1:] return clf.fit(X, y)
def trainClassifier(self, cursor, userId, clf): # The default value of MoviePopularity.popularity is 5 because # select cast(sum(rating) as real) / count(*) from Ratings # returns 0.503819588139731 cursor.execute( ''' SELECT Ratings.rating, ifnull(cast(MoviePopularity.popularity*10 as integer),5), MovieYearGenres.year, genreBits, {0} FROM Ratings join MovieYearGenres on Ratings.movieId=MovieYearGenres.id join MovieTags on Ratings.movieId=MovieTags.id left join MoviePopularity on Ratings.movieId=MoviePopularity.movieId where Ratings.userId=?'''.format(','.join( ['tagBits' + str(i) for i in range(self.tagBitsCount)])), (userId, )) trainingData = [ list(row[0:3]) + list(bitstring.Bits(int=row[3], length=len(self.ALL_GENRES))) + flatNestList( [list(bitstring.Bits(int=b, length=32)) for b in row[4:]]) for row in cursor.fetchall() ] trainingData = np.array(trainingData, dtype='int32') if len(trainingData) == 0: raise Exception( 'User {0} does not appear in training set.'.format(userId)) y = trainingData[:, 0] X = trainingData[:, 1:] return clf.fit(X, y)
def classifyForUser(self, con, userId): cur = con.cursor() clf = RandomForestClassifier(n_estimators=100, max_depth=10) clf = self.trainClassifier(cur, userId, clf) cur.execute( ''' SELECT ValidationRatings.movieId, ifnull(cast(MoviePopularity.popularity*10 as integer),5), MovieYearGenres.year, genreBits, {0} FROM ValidationRatings join MovieYearGenres on ValidationRatings.movieId=MovieYearGenres.id join MovieTags on ValidationRatings.movieId=MovieTags.id left join MoviePopularity on ValidationRatings.movieId=MoviePopularity.movieId where ValidationRatings.userId=?'''.format(','.join( ['tagBits' + str(i) for i in range(self.tagBitsCount)])), (userId, )) validationData = [ list(row[0:3]) + list(bitstring.Bits(int=row[3], length=len(self.ALL_GENRES))) + flatNestList( [list(bitstring.Bits(int=b, length=32)) for b in row[4:]]) for row in cur.fetchall() ] validationData = np.array(validationData, dtype='int32') predictY = clf.predict(validationData[:, 1:]) toDB = predictY[:, None] toDB = np.insert(toDB, 1, userId, axis=1) toDB = np.insert(toDB, 2, validationData[:, 0], axis=1) cur.executemany( 'update ValidationRatings set predict=? where userId=? and movieId=?', toDB.tolist()) if cur.rowcount == 0: raise Exception("No rows are updated.") # tree.plot_tree(clf) # plt.show() con.commit() self.predictTest(cur, userId, clf) con.commit()