コード例 #1
def run_test(username_='tathagata', samplesize=.10, K=5):
    """Cross-validate the KNN rating predictor for one user.

    username_ - user in smartplayer.user_rating db table
    samplesize - fraction of the songs held out as the test set in each
        fold; .10 means 10% will be test and 90% will be train
    K - number of neighbours handed to run_knn.run_all

    The test subset changes on every iteration: songs are popped off the
    list of '*.wav' files found under ``videosloc_`` in groups of
    ``floor(len(files) * samplesize)`` until none are left, so a
    samplesize of .10 gives roughly 10 folds (plus one short fold when
    the count does not divide evenly).

    Return values are the RMSE, the number of correct predictions and the
    number of incorrect predictions, respectively.
    """
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('--------- Running Tests and cross validation ----------')
    print('Size of k = {0}'.format(K))
    print('Cross validating with partition size: {0}'.format(samplesize))

    # presumably maps song name -> stored rating; verify against get_data
    actual_d = get_data.get_user_rat(username_)
    video_list = glob.glob(videosloc_ + '*.wav')

    # Always take at least one song per fold: a fold size of 0 would never
    # shrink video_list and the loop below would spin forever.
    fold_size = max(1, int(floor(len(video_list) * samplesize)))

    correct = 0
    incorrect = 0
    squared_error = 0
    while video_list:
        test_l = []
        for _ in range(fold_size):
            # Keep only the bare song name: strip directory and extension.
            test_l.append(video_list.pop().split('/')[-1].replace('.wav', ''))
            if not video_list:
                break  # last, possibly short, fold

        # run tests
        # assumes run_all returns a dict of song name -> predicted rating
        pred_d = run_knn.run_all(username_, test_l, K)
        for song, pred in pred_d.items():
            print('Actual {0}: Predicted: {1}'.format(actual_d[song], pred))
            if int(actual_d[song]) == int(pred):
                correct += 1
            else:
                incorrect += 1
            squared_error += (int(pred) - int(actual_d[song])) ** 2

    if not actual_d:
        # Nothing to average over; avoid dividing by zero.
        return 0.0, correct, incorrect

    # float() forces true division; under Python 2 int / int truncates.
    # NOTE(review): the mean is taken over the number of stored ratings,
    # not the number of predictions made - confirm this is intended.
    RMSE = sqrt(float(squared_error) / len(actual_d))
    return RMSE, correct, incorrect
コード例 #2
def create_mf(userid_, querysongs_, dir_):
    """Write the '<userid_>.mf' file listing songs and their labels.

    userid_ - userid for user in smartplayer database
    querysongs_ - list of song(s) that need ratings
    dir_ - a directory that contains all of the songs/videos
           in .wav format

    Creates an mf file that contains these songs with either
    a like or a dislike with them.
    This is what an mf file should look like (path, tab, label):

        /User/Rob/files/rap1.wav    1
        /User/Rob/files/rap2.wav    1
        /User/Rob/files/rap3.wav    1
        /User/Rob/files/rap4.wav    0

    Query songs are written first with label 0, followed by every other
    rated song with its stored rating. Songs whose .wav file is not
    present in dir_ are left out.

    Returns - a tuple (name of file written to, ratings dict). The dict is
    the one returned by get_data.get_user_rat, with each query song that
    exists on disk set to 0.
    """
    new_filename = userid_ + '.mf'
    if not dir_.endswith('/'):
        dir_ += '/'

    # Query the database for all songs that the user liked or disliked.
    # presumably maps song name -> rating; verify against get_data
    D_videos = get_data.get_user_rat(userid_)

    query_songs = list(querysongs_)
    query_set = set(query_songs)  # O(1) membership test in the loop below

    # The context manager closes (and flushes) the file before its name is
    # handed back, so the caller always sees the full contents.
    with open(new_filename, 'w') as mffile_:
        for song in query_songs:
            songpath = dir_ + song + '.wav'
            if os.path.isfile(songpath):
                mffile_.write(songpath + '\t0\n')
                D_videos[song] = 0  # test song has rating 0

        for song, rating in D_videos.items():
            if song in query_set:  # don't include test songs
                continue
            songpath = dir_ + song + '.wav'
            if os.path.isfile(songpath):
                mffile_.write(songpath + '\t' + str(rating) + '\n')

    return new_filename, D_videos
コード例 #3
def run_all_ks(username_='tathagata', samplesize_=0.1):
    """Run run_test for a range of k values and report the best one.

    username_ - user whose ratings are fetched with get_data.get_user_rat
    samplesize_ - partition size passed through to run_test

    k runs from 1 up to, but not including, int(sqrt(n)), where n is the
    number of ratings returned for the user. The best k is the one with
    the fewest incorrect predictions; on ties the smallest k wins.

    Prints the per-k results and the best k; returns nothing.
    """
    k_d = {}
    corr_d = {}
    incorr_d = {}
    actual_d = get_data.get_user_rat(username_)
    upto = int(sqrt(len(actual_d)))

    bestk = None
    low_incorr_num = None
    # NOTE(review): the original description reads "k = 1 to sqrt(n)" but
    # this range stops at upto - 1 - confirm whether upto should be included.
    for k in range(1, upto):
        k_d[k], corr_d[k], incorr_d[k] = run_test(username_, samplesize_, k)
        if bestk is None or incorr_d[k] < low_incorr_num:
            bestk = k
            # Track the running minimum so later k values are compared
            # with the best count so far, not only with the k == 1 count.
            low_incorr_num = incorr_d[k]
        print('for k = {0} the RMSE was {1}'.format(str(k), str(k_d[k])))
        print('for k = {0} the correct was {1}'.format(str(k), str(corr_d[k])))
        print('for k = {0} the incorrect was {1}'.format(str(k), str(incorr_d[k])))

    if bestk is None:
        # With fewer than 4 ratings the range above is empty.
        print('No value of k was tested (not enough ratings)')
        return
    print('Best value of k to use is {0}'.format(str(bestk)))
コード例 #4
def run_all_ks(username_="tathagata", samplesize_=0.1):
    """Run run_test for a range of k values and report the best one.

    username_ - user whose ratings are fetched with get_data.get_user_rat
    samplesize_ - partition size passed through to run_test

    k runs from 1 up to, but not including, int(sqrt(n)), where n is the
    number of ratings returned for the user. The best k is the one with
    the fewest incorrect predictions; on ties the smallest k wins.

    Prints the per-k results, then the best k together with its incorrect
    count and RMSE; returns nothing.
    """
    k_d = {}
    corr_d = {}
    incorr_d = {}
    actual_d = get_data.get_user_rat(username_)
    upto = int(sqrt(len(actual_d)))

    bestk = None
    low_incorr_num = None
    low_RMSE = None
    # NOTE(review): the original description reads "k = 1 to sqrt(n)" but
    # this range stops at upto - 1 - confirm whether upto should be included.
    for k in range(1, upto):
        k_d[k], corr_d[k], incorr_d[k] = run_test(username_, samplesize_, k)
        if bestk is None or incorr_d[k] < low_incorr_num:
            bestk = k
            low_incorr_num = incorr_d[k]
            low_RMSE = k_d[k]
        print("for k = {0} the RMSE was {1}".format(str(k), str(k_d[k])))
        print("for k = {0} the correct was {1}".format(str(k), str(corr_d[k])))
        print("for k = {0} the incorrect was {1}".format(str(k), str(incorr_d[k])))

    if bestk is None:
        # With fewer than 4 ratings the range above is empty; without this
        # guard the summary prints would fail on unbound names.
        print("No value of k was tested (not enough ratings)")
        return
    print("Best value of k to use is {0}".format(str(bestk)))
    print("Number of incorrect for this k was {0}".format(str(low_incorr_num)))
    # NOTE(review): this is the RMSE of the selected k, which is not
    # necessarily the smallest RMSE over all k - confirm the label.
    print("Lowest RMSE was {0}".format(str(low_RMSE)))