def run_test(username_='tathagata', samplesize=.10, K=5):
    """Cross-validate the KNN rating predictions for one user.

    username_  - user in the smartplayer.user_rating db table
    samplesize - fraction of the .wav files used as the test set in each
                 round; example: .10 - 10% will be test and 90% will be train
    K          - the k value handed to run_knn.run_all (presumably the number
                 of neighbours -- confirm against run_knn)

    The test set changes on every iteration: the .wav files found under
    videosloc_ are consumed in chunks of floor(len(files) * samplesize), so
    for a samplesize of .10 the algorithm runs about 10 times, each time with
    a different subset as the testing set.  A chunk always holds at least one
    file, so a small collection (or a tiny samplesize) still terminates.

    RETURN: tuple (RMSE, correct, incorrect).  RMSE is 0.0 when the user has
    no ratings at all.
    """
    print('--------- Running Tests and cross validation ----------')
    print('Size of k = {0}'.format(K))
    print('Cross validating with partition size: {0}'.format(samplesize))

    actual_d = get_data.get_user_rat(username_)
    video_list = glob.glob(videosloc_ + '*.wav')

    # A chunk size of 0 would never shrink video_list and the loop below
    # would spin forever, so fall back to one file per round.
    chunk_size = max(1, int(floor(len(video_list) * samplesize)))

    correct = 0
    incorrect = 0
    RMSE_val = 0  # running sum of squared errors

    while video_list:
        # Pull the next chunk of song names (basename without '.wav').
        test_l = []
        while video_list and len(test_l) < chunk_size:
            test_l.append(video_list.pop().split('/')[-1].replace('.wav', ''))

        # run tests
        pred_d = run_knn.run_all(username_, test_l, K)
        for song, pred in pred_d.items():
            print('Actual {0}: Predicted: {1}'.format(actual_d[song], pred))
            if int(actual_d[song]) == int(pred):
                correct += 1
            else:
                incorrect += 1
            RMSE_val += (int(pred) - int(actual_d[song])) ** 2

    if not actual_d:
        # No ratings means nothing to average over; avoid dividing by zero.
        return 0.0, correct, incorrect

    # float() forces true division; with two ints Python 2 would truncate
    # the mean before the square root is taken.
    # NOTE(review): the mean is taken over every rated song (len(actual_d)),
    # not over the number of predictions scored -- confirm this is intended.
    RMSE = sqrt(float(RMSE_val) / len(actual_d))
    return RMSE, correct, incorrect
def create_mf(userid_, querysongs_, dir_):
    """Write '<userid_>.mf' listing song paths with a rating for each.

    userid_     - userid for user in smartplayer database
    querysongs_ - list of song(s) that need ratings
    dir_        - directory that contains all of the songs/videos in .wav
                  format; a trailing '/' is added when missing

    Each line of the mf file is a .wav path, a tab, and a rating.  Query
    songs are written first with rating 0; every other song returned by
    get_data.get_user_rat follows with its stored rating.  Songs whose .wav
    file does not exist under dir_ are left out.  Example:

        /User/Rob/files/rap1.wav    1
        /User/Rob/files/rap2.wav    1
        /User/Rob/files/rap3.wav    1
        /User/Rob/files/rap4.wav    0

    Returns - tuple (name of file written to, ratings dict); in the dict
    every query song that was found on disk has its rating set to 0.
    """
    new_filename = userid_ + '.mf'
    if not dir_.endswith('/'):
        dir_ += '/'

    # Query the database for all songs that the user liked or disliked.
    D_videos = get_data.get_user_rat(userid_)

    # Set lookup keeps the "skip test songs" check cheap for long lists.
    query_set = set(querysongs_)

    # The with-block closes (and therefore flushes) the file before the name
    # is handed back, so whoever reads it sees the complete contents.
    with open(new_filename, 'w') as mffile_:
        for song in querysongs_:
            songpath = dir_ + song + '.wav'
            if os.path.isfile(songpath):
                mffile_.write(songpath + '\t0\n')
                D_videos[song] = 0  # test song has rating 0

        for song, rating in D_videos.items():
            if song in query_set:  # don't include test songs
                continue
            songpath = dir_ + song + '.wav'
            if os.path.isfile(songpath):
                mffile_.write(songpath + '\t' + str(rating) + '\n')

    return new_filename, D_videos
def run_all_ks(username_='tathagata', samplesize_=0.1):
    """Run run_test for a range of k values and report the best one.

    username_   - user whose ratings are fetched via get_data.get_user_rat
    samplesize_ - partition size forwarded to run_test

    k runs from 1 up to (but not including) int(sqrt(n)), where n is the
    number of ratings returned for the user.  The best k is the one with the
    fewest incorrect predictions; on a tie the smallest k wins.  Results are
    printed; nothing is returned.
    """
    # NOTE(review): a second definition of run_all_ks appears later in this
    # file and replaces this one when the module is loaded -- confirm which
    # version should be kept.
    k_d = {}
    corr_d = {}
    incorr_d = {}
    actual_d = get_data.get_user_rat(username_)
    upto = int(sqrt(len(actual_d)))

    bestk = None
    low_incorr_num = None
    for k in range(1, upto):
        k_d[k], corr_d[k], incorr_d[k] = run_test(username_, samplesize_, k)
        # Track the running minimum; without updating low_incorr_num every
        # later k would only ever be compared against the k = 1 result.
        if low_incorr_num is None or incorr_d[k] < low_incorr_num:
            bestk = k
            low_incorr_num = incorr_d[k]
        print('for k = {0} the RMSE was {1}'.format(str(k), str(k_d[k])))
        print('for k = {0} the correct was {1}'.format(str(k), str(corr_d[k])))
        print('for k = {0} the incorrect was {1}'.format(str(k), str(incorr_d[k])))

    if bestk is None:
        # range(1, upto) is empty whenever there are fewer than 4 ratings.
        print('No values of k were tested (at least 4 ratings are needed)')
        return
    print('Best value of k to use is {0}'.format(str(bestk)))
def run_all_ks(username_="tathagata", samplesize_=0.1):
    """Run run_test for a range of k values and report the best one.

    username_   - user whose ratings are fetched via get_data.get_user_rat
    samplesize_ - partition size forwarded to run_test

    k runs from 1 up to (but not including) int(sqrt(n)), where n is the
    number of ratings returned for the user.  The best k is the one with the
    fewest incorrect predictions; on a tie the smallest k wins.  Results are
    printed; nothing is returned.
    """
    # NOTE(review): an earlier definition of run_all_ks exists in this file;
    # this one rebinds the name when the module is loaded.
    k_d = {}
    corr_d = {}
    incorr_d = {}
    actual_d = get_data.get_user_rat(username_)
    upto = int(sqrt(len(actual_d)))

    # Pre-bind the summary values so an empty k range cannot hit an
    # unbound-local error at the final prints.
    bestk = None
    low_incorr_num = None
    low_RMSE = None
    for k in range(1, upto):
        k_d[k], corr_d[k], incorr_d[k] = run_test(username_, samplesize_, k)
        if bestk is None or incorr_d[k] < low_incorr_num:
            bestk = k
            low_incorr_num = incorr_d[k]
            low_RMSE = k_d[k]
        print("for k = {0} the RMSE was {1}".format(str(k), str(k_d[k])))
        print("for k = {0} the correct was {1}".format(str(k), str(corr_d[k])))
        print("for k = {0} the incorrect was {1}".format(str(k), str(incorr_d[k])))

    if bestk is None:
        # range(1, upto) is empty whenever there are fewer than 4 ratings.
        print("No values of k were tested (at least 4 ratings are needed)")
        return
    print("Best value of k to use is {0}".format(str(bestk)))
    print("Number of incorrect for this k was {0}".format(str(low_incorr_num)))
    # This is the RMSE of the selected k (chosen by incorrect count), which
    # is not necessarily the smallest RMSE seen across all k.
    print("Lowest RMSE was {0}".format(str(low_RMSE)))