Example #1
def compute_differences(d1, d2, test_words):
    ups_max = []
    ups_n = []
    rhos_max = []
    rhos_n = []
    n_var = parameters["evaluation n value"]
    for w in sorted(test_words):
        n_max = min(len(d1[w]), len(d2[w]))
        tw1_sorted = sorted(d1[w].items(), key=lambda x: (-x[1], x[0]))
        tw2_sorted = sorted(d2[w].items(), key=lambda x: (-x[1], x[0]))
        lw1 = [e[0] for e in tw1_sorted]
        lw2 = [e[0] for e in tw2_sorted]
        dw1_max = normalize_dict(dict(tw1_sorted[:n_max]))
        dw2_max = normalize_dict(dict(tw2_sorted[:n_max]))
        dw1_n = normalize_dict(dict(tw1_sorted[:n_var]))
        dw2_n = normalize_dict(dict(tw2_sorted[:n_var]))
        upsilon_max = 0.5 * sum(abs((dw1_max.get(resp) or 0) - (dw2_max.get(resp) or 0)) for resp in sorted(set(dw1_max) | set(dw2_max)))
        upsilon_n = 0.5 * sum(abs((dw1_n.get(resp) or 0) - (dw2_n.get(resp) or 0)) for resp in sorted(set(dw1_n) | set(dw2_n)))

        sl, ll = sorted([(len(lw1), lw1), (len(lw2), lw2)])
        s, S = sl
        l, L = ll
        rho_max = 1 - metrics.apk(S, L, s)
        rho_n = 1 - metrics.apk(S[:n_var], L, n_var)

        # ups_max.append((w, upsilon_max))
        # ups_n.append((w, upsilon_n))
        # rhos_max.append((w, rho_max))
        # rhos_n.append((w, rho_n))
        ups_max.append(upsilon_max)
        ups_n.append(upsilon_n)
        rhos_max.append(rho_max)
        rhos_n.append(rho_n)
    return ups_max, ups_n, rhos_max, rhos_n
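Assuming normalize_dict rescales each dictionary to sum to 1, the upsilon values above are total variation distances (half the L1 distance) between the two response distributions, and the rho values are 1 minus AP@k over the ranked response lists. A tiny standalone illustration of the upsilon term, with made-up distributions:
dw1 = {'a': 0.6, 'b': 0.4}
dw2 = {'a': 0.5, 'c': 0.5}
tv = 0.5 * sum(abs(dw1.get(r, 0) - dw2.get(r, 0)) for r in set(dw1) | set(dw2))
print(tv)  # 0.5 * (0.1 + 0.4 + 0.5) = 0.5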
Example #2
def test_apk(self):
    print metrics.apk(range(1, 6), [6, 4, 7, 1, 2], 2)
    self.assertAlmostEqual(metrics.apk(range(1, 6), [1, 1, 1, 1, 1], 5),
                           0.2)
    predicted = range(1, 21)
    predicted.extend(range(200, 600))
    self.assertAlmostEqual(metrics.apk(range(1, 100), predicted, 20), 1.0)
Example #3
def mapk(truth, predict, k=5):
    count = 0
    sum = 0
    for i, v in enumerate(truth):
        sum = sum + ml_metrics.apk([v], predict[i], k)
        count = count + 1
    return float(sum)/count
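Since the helper above wraps each ground-truth item in a single-element list, it expects truth to be a flat list with one relevant item per query; for equal-length inputs, wrapping the items yourself and calling ml_metrics.mapk directly should give the same number. A minimal sketch with made-up values:
import ml_metrics

truth = [3, 7, 2]                                   # one relevant item per query
predict = [[3, 5, 9], [1, 7, 4], [8, 6, 2]]         # ranked predictions per query
print(mapk(truth, predict, k=3))                    # averages apk by hand
print(ml_metrics.mapk([[v] for v in truth], predict, k=3))  # same value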
Example #4
def average_precision(sorted_predictions, holdout_items, k=5):
    """compute average precision for a single user"""
    predicted_items = [pair[0] for pair in sorted_predictions]
    apk = ml_metrics.apk(holdout_items, predicted_items, k=k)
    frame = pd.DataFrame({
        'user_id': [user_recommendations['user_id'].values[0]],
        'average_precision': [apk]
    })
    return frame
Example #5
def apk_via_inds_orders(pos_inds, order, k=10):
    # Ranking for apk
    # Note: Usually we would use
    #   ```
    #   topk_part = np.argpartition(-y_p, k)[:k]  # topk partition
    #   topk = topk_part[np.argsort(-y_p[topk_part])]  # recompute actual
    #   ```
    # which is faster than `np.argsort(-y_p)[:k]`
    # however, we want `order` which can be reused for other metrics (MRR)
    # No need to truncate `order[:k]`
    score_apk = apk(list(pos_inds), order, k=k)
    return score_apk
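For reference, a small standalone sketch of the argpartition shortcut described in the comment above; y_p and k are made-up illustration values, not part of the original code:
import numpy as np

y_p = np.array([0.1, 0.9, 0.4, 0.7, 0.2])        # hypothetical score vector
k = 3
topk_part = np.argpartition(-y_p, k)[:k]         # indices of the k best scores, unordered
topk = topk_part[np.argsort(-y_p[topk_part])]    # order those k indices by score
order = np.argsort(-y_p)                         # full ranking, reusable for other metrics
assert list(topk) == list(order[:k])             # both give the same top-k ranking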
Example #6
def compute_mapk(gt, hypo, k_val):
    if list_depth(hypo) == 2:
        hypo = add_list_level(hypo.copy())
    apk_list = []
    for ii, query in enumerate(gt):
        for jj, sq in enumerate(query):
            apk_val = 0.0
            if len(hypo[ii]) > jj:
                apk_val = apk([sq], hypo[ii][jj], k_val)
            apk_list.append(apk_val)

    return np.mean(apk_list)
Example #7
def main(k1, b, k3, feature, query_feature, r):
    file_list = open(model_dir + "/file-list")
    file = file_list.read().split("\n")
    file.remove("")
    num_doc = len(file)

    file_dic = dict()
    for i in range(num_doc):
        term = file[i].split("/")[-1].lower()
        file_dic[term] = i

    inf_file = open(model_dir + "/inverted-file")
    inf = inf_file.read()
    del inf_file
    inf = inf.split("\n")
    inf.remove("")

    dictionaries, que_dic, select_voc, num_term, que = build_term_dic(
        model_dir, query_dir, num, k3=k3, F=feature, QF=query_feature)
    D = build_doc_vector(num_doc, dictionaries, inf, select_voc, k1=k1, b=b)
    #pca = PCA(n_components=int(num_term*0.9), svd_solver='full')
    #D_=pca.fit_transform(D)
    D_ = D
    if if_train:
        df = pd.read_csv("queries/ans_train.csv")
        truth = []
        for i in range(num):
            app = []
            ret = df["retrieved_docs"][i].split(" ")
            for ele in ret:
                app.append(file_dic[ele])
            truth.append(app)
        w_file = open("score.txt", "a")
        for i in r:
            train_rank = output(
                que, num, num_doc, D_,
                r=i)  #,feature=["title","concepts","question"])

            for j in range(num):
                print(
                    ml_metrics.apk(truth[j],
                                   train_rank[j, :100].tolist(),
                                   k=100))
            score = ml_metrics.mapk(truth, train_rank[:, :100].tolist(), k=100)
            #w_file.write("s=%.3f,k1=%.1f, k3=%d, b=%.2f ,r=%d, f="%(score,k1,k3,b,i)+str(feature)+" q="+str(query_feature)+"\n")
            print(i, score)
        w_file.close()

    else:

        train_rank = output(que, num, num_doc, D_, r=r)[:, :100].tolist()
        return train_rank, file
Example #8
def mapk_train_vectors(models, labels, k=10):
    """
    Log MAP@k scores for the given models.

    :params:

      models (dict) : dict mapping model name to model
      labels (dict) : dict mapping item to its recommendations
      k (int) : K in MAP@K
    """

    logger.info("Starting evaluation process for MAP@{}...".format(k))

    best_score = 0
    best_model = None

    for model in models:

        predictions = {}

        for doc_id in labels.keys():
            try:
                predictions[doc_id] = [
                    l[0] for l in model.docvecs.most_similar(doc_id, topn=k)
                ]
            except TypeError:
                pass

        mapk = np.mean([
            apk(labels[doc_id], predictions[doc_id], k)
            for doc_id in labels.keys() if doc_id in predictions.keys()
        ])
        logger.info("{0} - MAP@{1} : {2}\n".format(str(model), k, mapk))

        if mapk > best_score:
            best_score = mapk
            best_model = str(model)

    logger.info("Best model with MAP@{0} = {1} : {2} \n ".format(
        k, best_score, best_model))
Example #9
np.nan_to_num(testStructure)


print "Ranking......."
testStructure = np.transpose(testStructure)
scores = np.matmul(testStructure, invFileTable)
ranks = np.argsort(-scores, axis=1)

numImgs = ranks.shape[0]
mapest = 0
trainImgsperFolder = invFileTable.shape[1]/84
testImgsperFolder = numImgs/84

for i in range(numImgs):
	x = (i/testImgsperFolder)*trainImgsperFolder
	mapest = mapest + metrics.apk(range(x, x+trainImgsperFolder), ranks[i,:], 10)

mapest = mapest/numImgs
print mapest
#for i in range(numImgs):
#	print i/testImgsperFolder+1,">>>>>", ranks[i,:10]/trainImgsperFolder+1
Example #10
def test_apk(self):
    self.assertAlmostEqual(metrics.apk(range(1, 6), [6, 4, 7, 1, 2], 2), 0.25)
    self.assertAlmostEqual(metrics.apk(range(1, 6), [1, 1, 1, 1, 1], 5), 0.2)
    predicted = range(1, 21)
    predicted.extend(range(200, 600))
    self.assertAlmostEqual(metrics.apk(range(1, 100), predicted, 20), 1.0)
Example #11
  
  for epoch in range(0,30):
    print(80*'=')
    print ('Epoch [{}/{}]'.format(epoch,30-1))
    #def get_dataloader(train_root_dir,valid_root_dir,train_csv_name,valid_csv_name,num_train_triplets,num_valid_triplets,batch_size,num_workers)
    data_loader,data_size = get_dataloader(train_path,test_path,train_csv_path,test_csv_path,num_train_triplets,num_test_triplets,5,6)
    
    train_valid(model,optimizer,scheduler,epoch,data_loader,data_size)
  print(80*"=")

#call run_experiment
run_experiment(train_path,test_path,train_csv_path,test_csv_path)

sumAP =0.0
for i in range(len(actual_list)):
  sumAP+=ml_metrics.apk(actual_list[i].tolist(),predicted_list[i].tolist(),2)
  print(ml_metrics.apk(actual_list[i].tolist(),predicted_list[i].tolist(),2))
print("mean Average precision ",str(sumAP/len(actual_list)))

sumAP =0.0
for i in range(len(actual_list)):
  sumAP+=ml_metrics.apk(actual_list[i].tolist(),predicted_list[i].tolist(),5)
  print(ml_metrics.apk(actual_list[i].tolist(),predicted_list[i].tolist(),5))
print("mean Average precision ",str(sumAP/len(actual_list)))

num_epochs=20

fig =plt.figure()
for phase in ['train', 'valid']:
  list_epoch    = []
  list_loss     = []
Example #12
                ite += 1
        app_score /= len(truth[i])
        score += app_score
    score /= len(predict)
    return score


import pandas as pd


def readans(path):
    df = pd.read_csv(path)
    truth = []
    for i in range(len(df)):
        app = []
        ret = df["retrieved_docs"][i].split(" ")
        for ele in ret:
            app.append(ele)
        truth.append(app)
    return truth


import ml_metrics
import sys
truth = readans("ans_train.csv")
predict = readans("%s.csv" % (sys.argv[1]))
print("overall score", ml_metrics.mapk(truth, predict, k=100))

for i in range(len(truth)):
    print(i, len(truth[i]), ml_metrics.apk(truth[i], predict[i], k=100))
Example #13
def mean_avg_precision_k(tupla, k=5):
    return ml_metrics.apk(tupla[0], tupla[1], k)
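The wrapper takes a single (actual, predicted) tuple, which makes it easy to map over a list of such pairs; a small hypothetical usage sketch with invented data:
pairs = [([1, 2, 3], [1, 4, 2]), ([5], [6, 5])]
scores = list(map(mean_avg_precision_k, pairs))
print(scores, sum(scores) / len(scores))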
Example #14
    def main(self):
        logger.info("Start")
        start = time.clock()

        # random users_ids
        user_ids = self.loadUserIds()

        # users --> friends dict
        users_friends, users_friends_set = self.loadUsersFriends(user_ids)
        logger.info("len of users_friends_set:%d"%len(users_friends_set))

        # user --> business
        users_business = self.loadUsersBusiness(users_friends_set)
        logger.info("len of users_business:%d"%len(users_business))

        # rating
        rankRatingDict, predicted_rating  = self.readCvs(INFILE_RANK_RATING)

        # rank business pagerank
        rank_pagerank = self.loadObjJson(self.infile_rank_pagerank)
        predicted_pagerank = [x['business_id'] for x in rank_pagerank]
        rankPagerankDict = self.createDictForWinners(rank_pagerank)

        # rank business indegree
        rank_indegree = self.loadObjJson(self.infile_rank_indegree)
        predicted_indegree = [x['business_id'] for x in rank_indegree]
        rankIndegreeDict = self.createDictForWinners(rank_indegree)

        all_users = users_friends.keys()
        #pp(all_users)
        samples = np.split(np.array(range(TOTAL_USERS)), NUM_SAMPES)
        maps = {'pagerank':[],'indegree':[],'rating':[]}
        winnersAllCountList = list()
        for i,chunk in enumerate(samples):
            logger.info("sample: %d"%i)
            sample = [all_users[x] for x in chunk]
            actual = self.getBusinessFromUsers(sample, users_friends, users_business)
            logger.info("len of actual:%d"%len(actual))

            map_pagerank = ml.apk(actual, predicted_pagerank, k=K_MAP)
            logger.info("map_pagerank[%d]:%6.6f"%(i,map_pagerank))
            maps['pagerank'].append(map_pagerank)
            
            map_indegree = ml.apk(actual, predicted_indegree, k=K_MAP)
            logger.info("map_indegree[%d]:%6.6f"%(i,map_indegree))
            maps['indegree'].append(map_indegree)

            map_rating = ml.apk(actual, predicted_rating, k=K_MAP)
            logger.info("map_rating[%d]:%6.6f"%(i,map_rating))
            maps['rating'].append(map_rating)

            winnersCountDict = self.winnersCountWinners(actual, rankRatingDict,rankPagerankDict,rankIndegreeDict)
            winnersAllCountList.append(winnersCountDict)

        
        #pp(maps)

        maps_stats = {'pagerank':{},'indegree':{},'rating':{}}
        for x in maps.keys():
            maps_stats[x]['mean'] = np.mean(maps[x])
            maps_stats[x]['std'] = np.std(maps[x])
        pp(maps_stats)

        win_stats = {'pagerank':{},'indegree':{},'rating':{}}
        for x in winnersCountDict.keys():
            win_stats[x]['mean'] = np.mean(winnersCountDict[x])
            win_stats[x]['std'] = np.std(winnersCountDict[x])
        pp(win_stats)
    

        # out final rank
        self.saveCsv(OUTFILE_RANK,maps)

        # out final winners
        self.saveWinnerCsv(OUTFILE_WINNER, winnersAllCountList)


        elapsed = (time.clock() - start)
        logger.info("done in %d secs"%int(elapsed))
Example #15
def main():

    args = parse_args()
    feature_folders = args.feature_folders
    prob_extensions = args.prob_extensions
    fusion_weights = args.fusion_weights
    split = args.split

    num_features = len(feature_folders)
    assert (num_features >= 1)
    assert (len(prob_extensions) == 1 or len(prob_extensions) == num_features)
    assert (len(fusion_weights) == 1 or len(fusion_weights) == num_features)
    if (len(prob_extensions) == 1):
        prob_extensions = np.repeat(prob_extensions, num_features)
    if (len(fusion_weights) == 1):
        fusion_weights = np.repeat(fusion_weights, num_features)

    print 'computing multi-features video accuracy of Hollywood2 split %d datasets ...' % split

    start_time = datetime.now()

    # getting train/test files information
    info_folder = '/home/tranlaman/Public/data/video/Hollywood2/ClipSets/'
    actions = [
        'AnswerPhone', 'DriveCar', 'Eat', 'FightPerson', 'GetOutCar',
        'HandShake', 'HugPerson', 'Kiss', 'Run', 'SitDown', 'SitUp', 'StandUp'
    ]
    num_actions = len(actions)

    test_info_file = '/home/tranlaman/Public/data/video/Hollywood2/ClipSets/actions_test.txt'
    lines = np.loadtxt(test_info_file,
                       dtype=str,
                       delimiter='  ',
                       comments=None)
    testFiles = lines[:, 0]
    numTestFiles = len(testFiles)
    assert (numTestFiles == 884)

    list_video_gt_per_action = []
    for ind in xrange(len(actions)):
        action = actions[ind]
        action_split_file = '%s/%s_test.txt' % (info_folder, action)
        lines = np.loadtxt(action_split_file,
                           dtype=str,
                           delimiter='  ',
                           comments=None)
        video_gt_per_action = []
        line_index = 0
        for line in lines:
            if int(line[1]) == 1:
                video_gt_per_action.append(line_index)
            line_index += 1
        list_video_gt_per_action.append(video_gt_per_action)

    # get prediction of videos
    list_video_prediction = list()
    list_video_score = list()
    for i in xrange(numTestFiles):
        video = testFiles[i]

        mean_prob = np.zeros((num_actions))
        for k in xrange(0, num_features):
            # load spatial features
            feature_path = os.path.join(feature_folders[k], video,
                                        '*.%s' % prob_extensions[k])
            feature_files = glob.glob(feature_path)
            feature_files.sort()
            if len(feature_files) == 0:
                print 'There are no clips in video %s' % video
                sys.exit(1)

            prob_mat = np.zeros((num_actions, len(feature_files)))
            for ind in xrange(0, len(feature_files)):
                file_path = os.path.join(feature_folders[k], video,
                                         feature_files[ind])
                blob = load_blob_from_binary(file_path)
                prob_mat[:, ind] = blob

            feature_mean_prob = np.mean(prob_mat, axis=1)
            mean_prob = mean_prob + fusion_weights[k] * feature_mean_prob

        mean_prob = softmax(mean_prob)
        prediction = np.argmax(mean_prob)
        conf = mean_prob[prediction]
        list_video_prediction.append(prediction)
        list_video_score.append(conf)

    list_video_prediction = np.array(list_video_prediction)
    list_video_score = np.array(list_video_score)

    # compute average precision
    ap = np.zeros(len(actions))
    list_video_id = np.array(range(numTestFiles))
    for ind in xrange(len(actions)):
        pred_index = (list_video_prediction == ind)
        video_id_pred = list_video_id[pred_index]
        score_pred = list_video_score[pred_index]
        #sorted_idx = score_pred.argsort()[::-1]
        sorted_idx = sorted(range(len(score_pred)),
                            key=lambda k: score_pred[k],
                            reverse=True)
        sorted_video_id_pred = video_id_pred[sorted_idx]

        video_id_gt = list_video_gt_per_action[ind]
        ap[ind] = metrics.apk(video_id_gt, sorted_video_id_pred)

    meanAP = ap.mean()

    current_time = datetime.now()
    run_time = current_time - start_time
    print 'Run-time: ', run_time
    print "Video meanAP of two-stream on test set: {}".format(meanAP)

    # write the results into txt file
    expString = 'Video classification (meanAP) with multiple cnn features.\n'
    fid = open('results.txt', 'a')
    fid.write('\n--------------------------------------------------\n')
    fid.write('%s' % (expString))
    fid.write('Video meanAP on testset split %d of Hollywood2 is %f\n' %
              (split, meanAP))
    fid.write('Test on following feature folders:\n')
    for k in xrange(num_features):
        fid.write('Feature folder: %s\n' % feature_folders[k])
        fid.write('Prob extension: %s\n' % prob_extensions[k])
        fid.write('Fusion weight: %s\n' % fusion_weights[k])
    fid.write('Experiment finished at %s \n' % current_time)
    fid.close()

    # write per class accuracy
    # write the results into txt file
    fid = open('perClassAccuracy.txt', 'a')
    fid.write('\n--------------------------------------------------\n')
    fid.write('%s' % (expString))
    fid.write('Video meanAP on testset split %d of Hollywood2 is %f\n' %
              (split, meanAP))
    fid.write('Test on following feature folders:\n')
    for k in xrange(num_features):
        fid.write('Feature folder: %s\n' % feature_folders[k])
        fid.write('Prob extension: %s\n' % prob_extensions[k])
        fid.write('Fusion weight: %s\n' % fusion_weights[k])
    fid.write('Experiment finished at %s \n' % current_time)
    fid.write('Per class AP\n')
    for ind in xrange(0, num_actions):
        fid.write('Class %d: ... %0.2f\n' % (ind, ap[ind]))
    fid.close()
Example #16
def main():

    args = parse_args()
    feature_folders = args.feature_folders
    prob_extensions = args.prob_extensions
    fusion_weights = args.fusion_weights
    split = args.split

    num_features = len(feature_folders)
    assert (num_features >= 1)
    assert (len(prob_extensions) == 1 or len(prob_extensions) == num_features)
    assert (len(fusion_weights) == 1 or len(fusion_weights) == num_features)
    if (len(prob_extensions) == 1):
        prob_extensions = np.repeat(prob_extensions, num_features)
    if (len(fusion_weights) == 1):
        fusion_weights = np.repeat(fusion_weights, num_features)

    print 'computing multi-features video accuracy of Olympic split %d datasets ...' % split

    start_time = datetime.now()

    # getting train/test files information
    info_folder = '/home/tranlaman/Public/data/video/Olympic_Sports_split/test/'
    actions = [
        'basketball_layup', 'bowling', 'clean_and_jerk', 'discus_throw',
        'diving_platform_10m', 'diving_springboard_3m', 'hammer_throw',
        'high_jump', 'javelin_throw', 'long_jump', 'pole_vault', 'shot_put',
        'snatch', 'tennis_serve', 'triple_jump', 'vault'
    ]
    num_actions = len(actions)

    testFiles = []
    list_video_gt_per_action = []
    video_index = 0
    for ind in xrange(len(actions)):
        action = actions[ind]
        action_split_file = '%s/%s.txt' % (info_folder, action)
        lines = np.loadtxt(action_split_file,
                           dtype=str,
                           delimiter='  ',
                           comments=None)
        video_gt_per_action = []
        for line in lines:
            video_gt_per_action.append(video_index)
            video_index += 1
            testFiles.append(os.path.join(action, line))
        list_video_gt_per_action.append(video_gt_per_action)

    numTestFiles = len(testFiles)
    assert (numTestFiles == 134)

    # get prediction of videos
    list_video_prediction = list()
    list_video_score = list()
    for i in xrange(numTestFiles):
        video = testFiles[i]

        mean_prob = np.zeros((num_actions))
        for k in xrange(0, num_features):
            # load spatial features
            feature_path = os.path.join(feature_folders[k], video,
                                        '*.%s' % prob_extensions[k])
            feature_files = glob.glob(feature_path)
            feature_files.sort()
            if len(feature_files) == 0:
                print 'There are no clips in video %s' % video
                sys.exit(1)

            prob_mat = np.zeros((num_actions, len(feature_files)))
            for ind in xrange(0, len(feature_files)):
                file_path = os.path.join(feature_folders[k], video,
                                         feature_files[ind])
                blob = load_blob_from_binary(file_path)
                prob_mat[:, ind] = blob

            feature_mean_prob = np.mean(prob_mat, axis=1)
            mean_prob = mean_prob + fusion_weights[k] * feature_mean_prob

        mean_prob = softmax(mean_prob)
        prediction = np.argmax(mean_prob)
        conf = mean_prob[prediction]
        list_video_prediction.append(prediction)
        list_video_score.append(conf)

    list_video_prediction = np.array(list_video_prediction)
    list_video_score = np.array(list_video_score)

    # compute average precision
    ap = np.zeros(len(actions))
    list_video_id = np.array(range(numTestFiles))
    for ind in xrange(len(actions)):
        pred_index = (list_video_prediction == ind)
        video_id_pred = list_video_id[pred_index]
        score_pred = list_video_score[pred_index]
        sorted_idx = score_pred.argsort()[::-1]
        #sorted_idx = sorted(range(len(score_pred)), key=lambda k: score_pred[k], reverse=True)
        sorted_video_id_pred = video_id_pred[sorted_idx]

        video_id_gt = list_video_gt_per_action[ind]
        ap[ind] = metrics.apk(video_id_gt, sorted_video_id_pred)

    meanAP = ap.mean()

    current_time = datetime.now()
    run_time = current_time - start_time
    print 'Run-time: ', run_time
    print "Video meanAP of two-stream on test set: {}".format(meanAP)

    # write the results into txt file
    expString = 'Video classification (meanAP) with multiple cnn features.\n'
    fid = open('results.txt', 'a')
    fid.write('\n--------------------------------------------------\n')
    fid.write('%s' % (expString))
    fid.write('Video meanAP on testset split %d of Olympic is %f\n' %
              (split, meanAP))
    fid.write('Test on following feature folders:\n')
    for k in xrange(num_features):
        fid.write('Feature folder: %s\n' % feature_folders[k])
        fid.write('Prob extension: %s\n' % prob_extensions[k])
        fid.write('Fusion weight: %s\n' % fusion_weights[k])
    fid.write('Experiment finished at %s \n' % current_time)
    fid.close()

    # write per class accuracy
    # write the results into txt file
    fid = open('perClassAccuracy.txt', 'a')
    fid.write('\n--------------------------------------------------\n')
    fid.write('%s' % (expString))
    fid.write('Video meanAP on testset split %d of Olympic is %f\n' %
              (split, meanAP))
    fid.write('Test on following feature folders:\n')
    for k in xrange(num_features):
        fid.write('Feature folder: %s\n' % feature_folders[k])
        fid.write('Prob extension: %s\n' % prob_extensions[k])
        fid.write('Fusion weight: %s\n' % fusion_weights[k])
    fid.write('Experiment finished at %s \n' % current_time)
    fid.write('Per class AP\n')
    for ind in xrange(0, num_actions):
        fid.write('Class %d: ... %0.2f\n' % (ind, ap[ind]))
    fid.close()
Example #17
def evaluateAP(actual, predicted):
    return metrics.apk(actual, predicted, K)
Example #18
    def main(self):
        logger.info("Start")
        start = time.clock()

        # random users_ids
        user_ids = self.loadUserIds()

        # users --> friends dict
        users_friends, users_friends_set = self.loadUsersFriends(user_ids)
        logger.info("len of users_friends_set:%d" % len(users_friends_set))

        # user --> business
        users_business = self.loadUsersBusiness(users_friends_set)
        logger.info("len of users_business:%d" % len(users_business))

        # rating
        rankRatingDict, predicted_rating = self.readCvs(INFILE_RANK_RATING)

        # rank business pagerank
        rank_pagerank = self.loadObjJson(self.infile_rank_pagerank)
        predicted_pagerank = [x['business_id'] for x in rank_pagerank]
        rankPagerankDict = self.createDictForWinners(rank_pagerank)

        # rank business indegree
        rank_indegree = self.loadObjJson(self.infile_rank_indegree)
        predicted_indegree = [x['business_id'] for x in rank_indegree]
        rankIndegreeDict = self.createDictForWinners(rank_indegree)

        all_users = users_friends.keys()
        #pp(all_users)
        samples = np.split(np.array(range(TOTAL_USERS)), NUM_SAMPES)
        maps = {'pagerank': [], 'indegree': [], 'rating': []}
        winnersAllCountList = list()
        for i, chunk in enumerate(samples):
            logger.info("sample: %d" % i)
            sample = [all_users[x] for x in chunk]
            actual = self.getBusinessFromUsers(sample, users_friends,
                                               users_business)
            logger.info("len of actual:%d" % len(actual))

            map_pagerank = ml.apk(actual, predicted_pagerank, k=K_MAP)
            logger.info("map_pagerank[%d]:%6.6f" % (i, map_pagerank))
            maps['pagerank'].append(map_pagerank)

            map_indegree = ml.apk(actual, predicted_indegree, k=K_MAP)
            logger.info("map_indegree[%d]:%6.6f" % (i, map_indegree))
            maps['indegree'].append(map_indegree)

            map_rating = ml.apk(actual, predicted_rating, k=K_MAP)
            logger.info("map_rating[%d]:%6.6f" % (i, map_rating))
            maps['rating'].append(map_rating)

            winnersCountDict = self.winnersCountWinners(
                actual, rankRatingDict, rankPagerankDict, rankIndegreeDict)
            winnersAllCountList.append(winnersCountDict)

        #pp(maps)

        maps_stats = {'pagerank': {}, 'indegree': {}, 'rating': {}}
        for x in maps.keys():
            maps_stats[x]['mean'] = np.mean(maps[x])
            maps_stats[x]['std'] = np.std(maps[x])
        pp(maps_stats)

        win_stats = {'pagerank': {}, 'indegree': {}, 'rating': {}}
        for x in winnersCountDict.keys():
            win_stats[x]['mean'] = np.mean(winnersCountDict[x])
            win_stats[x]['std'] = np.std(winnersCountDict[x])
        pp(win_stats)

        # out final rank
        self.saveCsv(OUTFILE_RANK, maps)

        # out final winners
        self.saveWinnerCsv(OUTFILE_WINNER, winnersAllCountList)

        elapsed = (time.clock() - start)
        logger.info("done in %d secs" % int(elapsed))
Example #19
_Q = 3
### calibrated
### pr=MSD_rec.PredSIc(s2u_tr, _A, _Q, "songs_scores.txt")

### uncalibrated
pr = MSD_rec_shefali.PredSI(s2u_tr, _A, _Q)
pr2 = MSD_rec_shefali.PredSU(
    s2u_tr, 0.3,
    5)  #with only this, 0.0 for 10 users in contrast to 0.33 in SI

print 'Creating recommender..'
cp = MSD_rec_shefali.SReco(songs_ordered)

cp.Add(pr)
cp.Add(pr2)

cp.Gamma = [1.0]

r = cp.RecommendToUsers(users_v[user_min:user_max], u2s_v)
#cp.Valid(uu, u2s_v, u2s_h, n_batch=10)
c = 0
for i, user in enumerate(users_v[user_min:user_max]):
    c = c + metrics.apk(u2s_h[user], r[user])
    pass

print i
t = c / (i + 1)
print t

MSD_util.save_recommendations(r, "kaggle_songs.txt", osfile)
Example #20
    #print j
    if user in utsh:
        flag = 0
        #       print user
        rec = []
        l1 = uts[user]
        l2 = map(int, l1)

        #print l2[1]
        for i in l2:
            if i < 10000:
                flag = 1
                #print indices[i]
                rec.extend(indices[i])
            #rec.extend(indices[1])
#        print rec
#    print utsh[user]
        hid = map(int, utsh[user])
        if flag > 0:
            count = count + 1
        #print hid
        #print j
        c = c + metrics.apk(utsh[user], rec, 10000)
#print j
#print count
#print c
relevant = c / (count)
print relevant
allu = c / j
#print allu
Example #21
    def _get_groupped_action(self, **kwargs):
        """
        Groups data by `group_by_field` field and calculates mean average
        precision.
        Note: `group_by_field` should be specified in request parameters.
        """
        from ml_metrics import apk
        import numpy as np
        from operator import itemgetter
        logging.info('Start request for calculating MAP')

        group_by_field, count = self._parse_map_params()
        if not group_by_field:
            return odesk_error_response(400, ERR_INVALID_DATA,
                                        'field parameter is required')

        res = []
        avps = []

        groups = TestExample.get_grouped(
            field=group_by_field,
            model_id=kwargs.get('model_id'),
            test_result_id=kwargs.get('test_result_id')
        )

        import sklearn.metrics as sk_metrics
        import numpy
        if len(groups) < 1:
            logging.error('Can not group')
            return odesk_error_response(400, ERR_INVALID_DATA,
                                        'Can not group')
        if 'prob' not in groups[0]['list'][0]:
            logging.error('Examples do not contain probabilities')
            return odesk_error_response(400, ERR_INVALID_DATA, 'Examples do \
not contain probabilities')
        if not isinstance(groups[0]['list'][0]['prob'], list):
            logging.error('Examples do not contain probabilities')
            return odesk_error_response(400, ERR_INVALID_DATA, 'Examples do \
not contain probabilities')

        if groups[0]['list'][0]['label'] in ("True", "False"):
            def transform(x):
                return int(bool(x))
        elif groups[0]['list'][0]['label'] in ("0", "1"):
            def transform(x):
                return int(x)
        else:
            logging.error('Label type is not supported')
            return odesk_error_response(400, ERR_INVALID_DATA,
                                        'Label type is not supported')
        logging.info('Calculating avps for groups')
        calc_average = True
        for group in groups:
            group_list = group['list']

            labels = [transform(item['label']) for item in group_list]
            pred_labels = [transform(item['pred']) for item in group_list]
            probs = [item['prob'][1] for item in group_list]
            if len(labels) > 1:
                labels = numpy.array(labels)
                probs = numpy.array(probs)
                try:
                    precision, recall, thresholds = \
                        sk_metrics.precision_recall_curve(labels, probs)
                    avp = sk_metrics.auc(recall[:count], precision[:count])
                except:
                    avp = apk(labels, pred_labels, count)
            else:
                avp = apk(labels, pred_labels, count)
            if math.isnan(avp):
                calc_average = False
                avp = "Can't be calculated"
            avps.append(avp)
            res.append({'group_by_field': group[group_by_field],
                        'count': len(group_list),
                        'avp': avp})

        res = sorted(res, key=itemgetter("count"), reverse=True)[:100]
        logging.info('Calculating map')
        mavp = np.mean(avps) if calc_average else "N/A"
        context = {self.list_key: {'items': res},
                   'field_name': group_by_field,
                   'mavp': mavp}
        logging.info('End request for calculating MAP')
        return self._render(context)
Example #22
        listOfReleventList = []
        listOfRetrievedList = []

        bar = progressbar.ProgressBar(redirect_stdout=True)
        for i in bar(range(len(queryList))):
            queryImage = np.loadtxt('./' + queryDir + '/' + str(queryList[i]) +
                                    '.vc',
                                    delimiter=',')
            relevantList = buildRelevantList(queryDir, queryList[i])
            nearestNeighbours = imageUtilities.initList(len(relevantList))
            distance = Parallel(n_jobs=-1)(
                delayed(worker)(queryImage, referenceList[j], databaseDir)
                for j in range(len(referenceList)))
            nearestNeighbours = sorted(
                distance, key=itemgetter('distance'))[:len(relevantList)]
            retrievedList = []
            for k in nearestNeighbours:
                retrievedList.append(int(k['filename']))
            apk = metrics.apk(actual=relevantList, predicted=retrievedList)
            listOfReleventList.append(relevantList)
            listOfRetrievedList.append(retrievedList)
        mapk = metrics.mapk(actual=listOfReleventList,
                            predicted=listOfRetrievedList)
        print(mapk)
        text = '\nMAPK for ' + queryDir + ' against ' + \
            databaseDir + ' is ' + str(mapk)
        file = open("mapk.txt", "a")
        file.write(text)
        file.close()
Example #23
def average_precision(self):
    from ml_metrics import apk
    if not hasattr(self, '_apk'):
        self._apk = apk(self._labels, self._preds)
    return self._apk
Example #24
def mAP_(actual, predicted):
  AP = [ml_metrics.apk(actual=actual[i], predicted=predicted[i], k=5) if len(actual[i]) else 0 for i in range(len(predicted)) ]
  mAP = np.mean(AP)
  std = np.std(AP)/np.sqrt(len(AP))
  #print(f'Error: {mAP:.4f} +/- {3*std:.4f}')
  return mAP
Example #25
        for song in songs_ordered: 
            if len(songs_to_recommend) >= 500: 
                break 
            if not song in user_to_songs[user]: 
                songs_to_recommend.append(song_to_index[song]) 
                pass
        
        myDic[user] = songs_to_recommend
        print user
        print myDic[1]
        pass
    pass

c = 0
for i, user in enumerate(canonical_users):
    c = c+metrics.apk(uts[user],myDic[user])
    pass 

t = c/i
print t

# 2nd file starts from here

with open ("song_unsort.txt", "w") as f2:
Example #26
import ml_metrics as metrics

# I am going to show how order matters in MAP@K when there is only one answer.
#
# This experiment is done by calculating AP@K, which gives one value per query; MAP@K is the average of AP@K.

actual = [1]

predicted = [1, 2, 3, 4, 5]

print('Answer=', actual, 'predicted=', predicted)
print('AP@5 =', metrics.apk(actual, predicted, 5))

predicted = [2, 1, 3, 4, 5]
print('Answer=', actual, 'predicted=', predicted)
print('AP@5 =', metrics.apk(actual, predicted, 5))

predicted = [3, 2, 1, 4, 5]
print('Answer=', actual, 'predicted=', predicted)
print('AP@5 =', metrics.apk(actual, predicted, 5))

predicted = [4, 2, 3, 1, 5]
print('Answer=', actual, 'predicted=', predicted)
print('AP@5 =', metrics.apk(actual, predicted, 5))

predicted = [4, 2, 3, 5, 1]
print('Answer=', actual, 'predicted=', predicted)
print('AP@5 =', metrics.apk(actual, predicted, 5))
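With a single relevant item, AP@K as computed by ml_metrics reduces to 1/rank of that item within the top K, so the five runs above should print 1.0, 0.5, 0.3333..., 0.25 and 0.2. A minimal check of that identity:
for rank in range(1, 6):
    predicted = [0] * (rank - 1) + [1] + [0] * (5 - rank)  # dummy fillers, single hit at `rank`
    assert abs(metrics.apk([1], predicted, 5) - 1.0 / rank) < 1e-9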
Example #27
def stop(x, y, z, a):
    if z > len(y):
        sys.exit()
    else:
        return metrics.apk(x, y, a)
Example #28
def get_cluster_recommendations_and_calculate_metrics(
    estimator,
    user_artists_df: pd.DataFrame(columns=["user_id", "artists"]),
    train_matrix: csr_matrix,
    test_matrix: csr_matrix,
    train: NDArray[(Any, ), int],
    test: NDArray[(Any, ), int],
) -> Dict[int, float]:
    """
    Обучение модели кластеризации, поиск рекомендаций, оценка качества рекомендаций.
    """
    estimator = estimator.fit(train_matrix.toarray())
    train_df = pd.DataFrame([], columns=["artists", "clusters"])
    train_df["artists"] = train
    train_df["clusters"] = estimator.labels_
    train_df = train_df.groupby("clusters").agg(list)

    print("Основные статистики кластеризации:")
    print("Количество исполнителей: ", len(estimator.labels_))
    print("Количество кластеров:", len(np.unique(estimator.labels_)))
    counts_clusters = np.unique(estimator.labels_, return_counts=True)[1]
    print("Наибольший размер кластера:", np.max(counts_clusters))
    print("Гистограмма распределения количества кластеров по их величине")
    hist = np.histogram(counts_clusters,
                        bins=[1, 2, 3, 4, 5, 10, 50, 100, 500, 1000])
    print("Деления: \n", hist[1])
    print("Частоты: \n", hist[0])

    test_df = pd.DataFrame([], columns=["artists", "clusters"])
    test_df["artists"] = test
    test_df["clusters"] = estimator.predict(test_matrix.toarray())
    test_df = test_df.groupby("clusters").agg(list).reset_index()

    metrics = {}
    for k in [1, 5, 10, 20]:
        metrics[k] = 0

    for cluster, test_artists in zip(test_df["clusters"], test_df["artists"]):
        # Build the recommendation as the top-21 artists in the matching cluster
        if cluster in train_df.index:
            recommend = get_topk_artists_from_idxs(21, user_artists_df,
                                                   train_df.loc[cluster])
            for artist_id in test_artists:
                # Find the lists of artists listened to by the same users
                # as artist_id:
                users = np.unique(user_artists_df.loc[
                    user_artists_df["artists"] == artist_id, "user_id"].values)
                similar_artists = (
                    user_artists_df.loc[user_artists_df["user_id"].isin(users),
                                        "artists"].drop_duplicates().values)
                similar_artists = similar_artists[similar_artists != artist_id]

                # Compute the quality metrics
                for key in metrics.keys():
                    metrics[key] += ml_metrics.apk(
                        actual=list(similar_artists),
                        predicted=list(recommend[:key]),
                        k=key,
                    )

    for key in metrics.keys():
        metrics[key] /= len(test)
        print("MAP@{}: ".format(key), metrics[key])
    return metrics