Example 1
def testBeyondAccurracyMetrics(train_filename, eval_item_filename, user_means_filename):
    
    logging.info('testing beyond-accuracy metrics with data files {0}; {1}; {2}...'.format(train_filename, eval_item_filename, user_means_filename))
    
    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)
    
    with open(eval_item_filename,'rb') as eval_file:
        for line in eval_file:
            data = line.split('\t')
            user_id = data[0]
            user_index = train_data.getUserIndex(user_id)
            
            if len(train_data.getUserProfileByIndex(user_index)) < 1:
                continue
            
            ground_truth_items = data[1].split(',')
            random_unrated_items = data[2].rstrip('\n').split(',')
             
            evaluation_item_ids = ground_truth_items + random_unrated_items
             
            rec_list_size = config.RECOMMENDATION_LIST_SIZE * config.DIVERSIFICATION_CANDIDATES_FACTOR
            
#             predictions = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
#             top_recs = topNLists.getTopNList(predictions, rec_list_size)
            
#             predictions_ib = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
#             top_recs_ib = topNLists.getTopNList(predictions_ib, rec_list_size)
            
#             predictions = library_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
#             top_recs = topNLists.getTopNList(predictions, rec_list_size, evaluation_item_ids)
            
            predictions_ub = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs_ub = topNLists.getTopNList(predictions_ub, rec_list_size)
            
#             print 'user',user_id
            
#             print top_recs_ib, top_recs_ub
            
#             rare = train_data.getPopularityInfo()[:10]
#             pop = train_data.getPopularityInfo()[-10:]
            
            top_recs = top_recs_ub
            print 'diversity_ratings',diversity.getListDiversity(train_data, top_recs, 'div_r')
            print 'diversity_content',diversity.getListDiversity(train_data, top_recs, 'div_c')
            print 'serendipity_content',serendipity.getListSerendipity(train_data, user_index, top_recs, 'sur_c')
            
#             print 'rare cooccurrence',serendipity.getListSerendipity(train_data, user_index, rare, 'sur_r')
#             print 'rare cooccurrence normalized',serendipity.getListSerendipity(train_data, user_index, rare, 'sur_r_n')
#             
#             print 'pop cooccurrence',serendipity.getListSerendipity(train_data, user_index, pop, 'sur_r')
#             print 'pop cooccurrence normalized',serendipity.getListSerendipity(train_data, user_index, pop, 'sur_r_n')
#             
#             print 'rare novelty',novelty.getListNovelty(train_data, rare)
#             
#             print 'pop novelty',novelty.getListNovelty(train_data, pop)
            
            print '------------------------------'
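
# A minimal sketch of the rating-based intra-list diversity ('div_r') reported
# above, assuming it is the average pairwise dissimilarity (1 - sim) / 2 over
# all item pairs in the list; this mirrors the ground-truth formula asserted in
# testToyExample below. The helper name is hypothetical, not part of the project.
def _sketchListDiversityRatings(train_data, item_list):
    item_ids = [item_id for item_id, _ in item_list]
    dissim_sum, pair_count = 0.0, 0
    for i in range(len(item_ids)):
        for j in range(i + 1, len(item_ids)):
            sim = train_data.item_similarity_matrix[train_data.getItemIndex(item_ids[i]),
                                                    train_data.getItemIndex(item_ids[j])]
            dissim_sum += (1.0 - sim) / 2.0
            pair_count += 1
    return dissim_sum / pair_count if pair_count else 0.0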
Example 2
def testItemContentLabels(train_filename, eval_item_filename, user_means_filename):
    
    logging.info('testing if all items have content labels with data files {0}; {1}; {2}...'.format(train_filename, eval_item_filename, user_means_filename))
    
    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)
    
    for item_index in train_data._col_indices.values():
        item_id = train_data.getItemId(item_index)
        
        if item_id not in config.ITEM_DATA:
            print 'index',item_index,'id',item_id
            print len(config.ITEM_DATA)
        
        assert item_id in config.ITEM_DATA
    
    logging.info('done! tested {0} items. Average num of content labels is {1}'.format( len(train_data._col_indices), np.mean([len(item_dict['labels']) for item_dict in config.ITEM_DATA.values()]) ))
    
    with open(eval_item_filename,'rb') as eval_file:
        for line in eval_file:
            data = line.split('\t')
            user_id = data[0]
            user_index = train_data.getUserIndex(user_id)
            
            if len(train_data.getUserProfileByIndex(user_index)) < 1:
                continue
            
            ground_truth_items = data[1].split(',')
            random_unrated_items = data[2].rstrip('\n').split(',')
             
            evaluation_item_ids = ground_truth_items + random_unrated_items
            rec_list_size = config.RECOMMENDATION_LIST_SIZE * config.DIVERSIFICATION_CANDIDATES_FACTOR
            predictions = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
            top_recs = topNLists.getTopNList(predictions, rec_list_size)
            
            print 'diversity_content',diversity.getListDiversity(train_data, top_recs, 'div_c')
            
            # only inspect the first user that has a non-empty training profile
            exit()
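
# A minimal sketch of the content-based diversity ('div_c') printed above,
# assuming it is the average pairwise Jaccard distance between the items'
# label sets in config.ITEM_DATA. This assumption is consistent with the
# expected value 4.5 / 6 asserted in testToyExample below; the actual
# diversity.getListDiversity may be defined differently.
def _sketchListDiversityContent(item_list):
    label_sets = [set(config.ITEM_DATA[item_id]['labels']) for item_id, _ in item_list]
    distance_sum, pair_count = 0.0, 0
    for i in range(len(label_sets)):
        for j in range(i + 1, len(label_sets)):
            union = label_sets[i] | label_sets[j]
            jaccard_sim = len(label_sets[i] & label_sets[j]) / float(len(union)) if union else 0.0
            distance_sum += 1.0 - jaccard_sim
            pair_count += 1
    return distance_sum / pair_count if pair_count else 0.0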
Example 3
def testPredictionMethods(train_filename, eval_item_filename, user_means_filename):
    '''
    Compare the predictions generated by the different approaches:
    computes the pairwise list overlap and the average recall for each method.
    '''
    
    logging.info('testing predictions with data files {0}; {1}; {2}...'.format(train_filename, eval_item_filename, user_means_filename))
    
    
    mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
    
    mrec_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
    mrec_recommender.fit(mrec_train_data)
    
    warp_recommender = WARPMFRecommender(d=50, gamma=0.01, C=100.0)
    warp_recommender.fit(mrec_train_data.X)
    
    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)
    
    recalls = {}
    overlaps = {}
    top_recs = {}
    user_counter = 0.0
    methods = ['mrec', 'warp', 'mf', 'ub_classic', 'ib_classic', 'ub_damping', 'ib_damping', 'ub_non', 'ib_non']
    
    with open(eval_item_filename,'rb') as eval_file:
        for line in eval_file:
            data = line.split('\t')
            user_id = data[0]
            ground_truth_items = data[1].split(',')
            random_unrated_items = data[2].rstrip('\n').split(',')
            
            evaluation_item_ids = ground_truth_items + random_unrated_items
            
            # for each prediction method, compute topN recommendations once per user
            predictions1 = mrec_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
            top_recs['mrec'] = topNLists.getTopNList(predictions1, evaluation_item_ids=evaluation_item_ids)
            
            predictions2 = warp_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
            top_recs['warp'] = topNLists.getTopNList(predictions2, evaluation_item_ids=evaluation_item_ids)
            
            predictions3 = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
            top_recs['mf'] = topNLists.getTopNList(predictions3)
            
            predictions4 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'classic')
            top_recs['ub_classic'] = topNLists.getTopNList(predictions4)
            
            predictions5 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'classic')
            top_recs['ib_classic'] = topNLists.getTopNList(predictions5)
            
            predictions6 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'self_damping')
            top_recs['ub_damping'] = topNLists.getTopNList(predictions6)
            
            predictions7 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'self_damping')
            top_recs['ib_damping'] = topNLists.getTopNList(predictions7)
            
            predictions8 = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs['ub_non'] = topNLists.getTopNList(predictions8)
            
            predictions9 = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs['ib_non'] = topNLists.getTopNList(predictions9)
            
            # then, use the computed topN lists to update recall and overlap values
            for method1 in methods:
                if method1 in recalls:
                    recalls[method1] += topNLists.getRecall(ground_truth_items, top_recs[method1])
                else:
                    recalls[method1] = topNLists.getRecall(ground_truth_items, top_recs[method1])
                
                for method2 in methods:
                    dict_key = method1 + '_' + method2
                    if dict_key in overlaps:
                        overlaps[dict_key] += topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])
                    else:
                        overlaps[dict_key] = topNLists.computeRecommendationListOverlap(top_recs[method1], top_recs[method2])
            
            user_counter += 1.0
            logging.info('Tested user {0}. Current recalls: {1}. Current overlaps: {2}'.\
                         format(user_id, [(k, v/user_counter) for k,v in recalls.items()], [(k, v/user_counter) for k,v in overlaps.items()]))
            
    return recalls, overlaps
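
# Minimal sketches of the two topNLists helpers the loop above relies on, under
# assumed definitions: recall as the fraction of ground-truth items found in the
# top-N list, and overlap as the raw count of items shared by two lists. Both
# assume top-N lists are (item_id, score) tuples; the real topNLists functions
# may normalize or order differently.
def _sketchGetRecall(ground_truth_items, top_recs):
    recommended_ids = set(item_id for item_id, _ in top_recs)
    hits = sum(1 for item_id in ground_truth_items if item_id in recommended_ids)
    return hits / float(len(ground_truth_items)) if ground_truth_items else 0.0

def _sketchListOverlap(top_recs_a, top_recs_b):
    ids_a = set(item_id for item_id, _ in top_recs_a)
    ids_b = set(item_id for item_id, _ in top_recs_b)
    return len(ids_a & ids_b)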
Example 4
def testToyExample():
    '''
    TEST data
    Users: Jack, u2, u3, Zak, Me
    Movies: A, B, C, D, E
    
            A    B    C    D    E
    Jack    5    1    3    4    3
    u2           4    1
    u3           2              5
    Zak     4    1    4    5    4
    Me      5         4         3
    '''
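    # Worked numbers from the table above, used by the similarity checks below:
    # mean(Jack) = (5+1+3+4+3)/5 = 3.2, mean(Zak) = (4+1+4+5+4)/5 = 3.6, mean(Me) = (5+4+3)/3 = 4.0.
    # After mean-centering (unrated cells stay 0), Me = [1.0, 0, 0, 0, -1.0] and
    # Jack = [1.8, -2.2, -0.2, 0.8, -0.2], so the Me/Jack cosine asserted further
    # down should be 2.0 / (sqrt(2.0) * sqrt(8.8)) ~= 0.48.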
    centered_file_path, user_means_file_path = dataPreprocessing._MeanCenter('../splits/TEST')
    
    data_m = trainData.TrainData(centered_file_path, user_means_file_path)
    
    # test the rating matrix
    assert data_m.rating_matrix.shape == (5,5)
    assert data_m.getItemNeighboursByIndex(data_m.getItemIndex('C'), None)[0][0] == data_m.getItemIndex('D')
    assert data_m.getNumOfItemRatersByIndex(data_m.getItemIndex('A')) == 3
    assert all(data_m.getItemProfileByIndex(data_m.getItemIndex('D')) == [data_m.getUserIndex('Jack'),data_m.getUserIndex('Zak')])
    assert data_m.getPopularityInfo()[0][0] == 'D'
    
    assert data_m.getUserNeighboursByIndex(data_m.getUserIndex('Jack'), None)[0][0] == data_m.getUserIndex('Zak')
    assert all(data_m.getUserProfileByIndex(data_m.getUserIndex('u2')) == [data_m.getItemIndex('B'),data_m.getItemIndex('C')])
    
    
    # test the user-user matrix
    jack = [5.0, 1.0, 3.0, 4.0, 3.0]
    mean_jack = np.mean([i for i in jack if i > 0.0])
    
    zak = [4.0, 1.0, 4.0, 5.0, 4.0]
    mean_zak = np.mean([i for i in zak if i > 0.0])
    
    me = [5.0, 0.0, 4.0, 0.0, 3.0]
    mean_me = np.mean([i for i in me if i > 0.0])
    
    for i in range(len(jack)):
        if jack[i] > 0.0:
            jack[i] -= mean_jack
    for i in range(len(zak)):
        if zak[i] > 0.0:
            zak[i] -= mean_zak
    for i in range(len(me)):
        if me[i] > 0.0:
            me[i] -= mean_me
    
    my_sim = data_m.user_similarity_matrix[data_m.getUserIndex('Me'), data_m.getUserIndex('Jack')] 
    ground_truth_sim = 1 - spatial.distance.cosine(me, jack)
    assert abs(ground_truth_sim - my_sim) < 0.001
    
    
    # test the item-item matrix
    a = [5.0-mean_jack, 0.0, 0.0, 4.0-mean_zak, 5.0-mean_me]
    d = [4.0-mean_jack, 0.0, 0.0, 5.0-mean_zak, 0.0]
    
    my_sim = data_m.item_similarity_matrix[data_m.getItemIndex('A'), data_m.getItemIndex('D')] 
    ground_truth_sim = 1 - spatial.distance.cosine(a, d)
    assert abs(ground_truth_sim - my_sim) < 0.001
    
    
    # test recommendation generation
#     user_id = 'Me'
#     evaluation_item_ids = ['A', 'B', 'C', 'D', 'E']
#     
#     _, _, Q = sparsesvd(data_m.rating_matrix.tocsc(), 2)
#     mf = data_m.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
#     
#     ub_classic = data_m.getUserBasedRecommendations(user_id, evaluation_item_ids, 'classic', verbose=True)
#     ub_damping = data_m.getUserBasedRecommendations(user_id, evaluation_item_ids, 'self_damping', verbose=True)
#     ub_non = data_m.getUserBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized', verbose=True)
#     
#     ib_classic = data_m.getItemBasedRecommendations(user_id, evaluation_item_ids, 'classic')
#     ib_damping = data_m.getItemBasedRecommendations(user_id, evaluation_item_ids, 'self_damping')
#     ib_non = data_m.getItemBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
#     
#     print mf
#     print '---------------------'
#     print ub_classic
#     print ub_damping
#     print ub_non
#     print '---------------------'
#     print ib_classic
#     print ib_damping
#     print ib_non
    
    # test diversity frameworkMetrics
    config.MOVIES_OR_MUSIC = 'movies'
    config.ITEM_DATA = {'A':{'labels':['horror']}, 'B':{'labels':['drama']}, 'C':{'labels':['drama']}, 'D':{'labels':['horror', 'comedy']}, 'E':{'labels':['drama']}}
    item_list = [('A',1.0),('B',1.0),('C',1.0),('D',0.5)]
    
    ground_truth_div = ((1.0 - data_m.item_similarity_matrix[data_m.getItemIndex('A'), data_m.getItemIndex('B')]) / 2.0 + \
                        (1.0 - data_m.item_similarity_matrix[data_m.getItemIndex('A'), data_m.getItemIndex('C')]) / 2.0 + \
                        (1.0 - data_m.item_similarity_matrix[data_m.getItemIndex('A'), data_m.getItemIndex('D')]) / 2.0 + \
                        (1.0 - data_m.item_similarity_matrix[data_m.getItemIndex('B'), data_m.getItemIndex('C')]) / 2.0 + \
                        (1.0 - data_m.item_similarity_matrix[data_m.getItemIndex('B'), data_m.getItemIndex('D')]) / 2.0 + \
                        (1.0 - data_m.item_similarity_matrix[data_m.getItemIndex('C'), data_m.getItemIndex('D')]) / 2.0) / 6.0
    
    assert diversity.getListDiversity(data_m, item_list, 'div_r') == ground_truth_div
    assert diversity.getListDiversity(data_m, item_list, 'div_c') == 4.5 / 6
    
    # test serendipity frameworkMetrics
    assert serendipity.getListSerendipity(data_m, data_m.getUserIndex('Me'), [('A',0.5)], 'coocc') == 0.0
    assert serendipity.getListSerendipity(data_m, data_m.getUserIndex('Me'), [('D',0.5)], 'cont') == 0.5
    assert abs(serendipity.getListSerendipity(data_m, data_m.getUserIndex('Me'), item_list, 'coocc') - 0.306732842163) < 0.001
    
    # test novelty frameworkMetrics
    assert novelty._getItemNovelty(data_m, 'B') == novelty._getItemNovelty(data_m, 'E')
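
# A minimal sketch of a popularity-based item novelty consistent with the last
# assertion above: B and E are each rated by 4 of the 5 users, so any novelty
# that depends only on the rater count (here, self-information over the rating
# fraction) makes them equal. The actual novelty._getItemNovelty may use a
# different formula; this helper and its rows-are-users assumption are illustrative.
def _sketchItemNovelty(train_data, item_id):
    item_index = train_data.getItemIndex(item_id)
    num_raters = train_data.getNumOfItemRatersByIndex(item_index)
    num_users = train_data.rating_matrix.shape[0]  # assumes rows correspond to users
    return -np.log2(num_raters / float(num_users))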
Example 5
    #config.DISCOUNT_ACCURACY_BY_BETTER_EXPLANATIONS = 1
    
    if not os.path.exists(config.SPLIT_DIR):
        os.makedirs(config.SPLIT_DIR)
    if not os.path.exists(config.RESULT_DIR):
        os.makedirs(config.RESULT_DIR)
    
    filenames = dataPreprocessing.loadData(mode='survey')
#     testDataSplitFiles()
    
    for iteration, (train_filename, test_filename, user_means_filename, eval_item_filename, opinion_filename) in enumerate(filenames, 1):
        
        if options.iteration and (iteration != options.iteration):
            continue
        
        train_data = trainData.TrainData(train_filename, user_means_filename)
        
        config.ITEM_OPINIONS.clear()
        with open(opinion_filename, 'rb') as opinion_file:
            config.ITEM_OPINIONS = pickle.load(opinion_file)
        
#         testDataCentering(train_filename, user_means_filename)
#         testEvaluationItemFile(train_filename, test_filename, eval_item_filename)
#         recalls, overlaps = testPredictionMethods(train_filename, eval_item_filename, user_means_filename)
#         testItemContentLabels(train_filename, eval_item_filename, user_means_filename)
#         testBeyondAccurracyMetrics(train_filename, eval_item_filename, user_means_filename)
#         testExplanations(train_data, test_filename, mean_center=False, n_users=5, n_recs=5, verb=False)
#         explanationMetrics._getRuleDiscountedAccuracy(train_data, 0.58, [('22', 'like')], ('50', 'like'), set(config.ITEM_OPINIONS['22', 'like']), True)
#         explanationMetrics._getRuleDiscountedAccuracy(train_data, 0.46, [('1617', 'like')], ('50', 'like'), set(config.ITEM_OPINIONS['1617', 'like']), True)
        generateExplanationSurvey(train_data, test_filename, iteration, verb=False)
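
# The snippet above reads 'options.iteration', which is not defined in this
# excerpt. A minimal optparse sketch of how it could be produced; the flag name
# and default are assumptions, and the parsing would have to run before the loop.
def _sketchParseOptions():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-i', '--iteration', dest='iteration', type='int', default=0,
                      help='run only this cross-validation split (0 = run all)')
    options, _ = parser.parse_args()
    return options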