Example 1
    # Item popularity: interactions per item, from the CSC column pointers
    # (mirrors the user_activity computation below)
    item_popularity = np.ediff1d(URM_all.tocsc().indptr)
    item_popularity = np.sort(item_popularity)
    pyplot.plot(item_popularity, 'ro')
    pyplot.ylabel('Num Interactions')
    pyplot.xlabel('Sorted Items')
    pyplot.show()

    # User activity: interactions per user, from the CSR row pointers
    user_activity = np.ediff1d(URM_all.indptr)
    user_activity = np.sort(user_activity)

    pyplot.plot(user_activity, 'ro')
    pyplot.ylabel('Num Interactions')
    pyplot.xlabel('Sorted Users')
    pyplot.show()

    #np.random.seed(1234)
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.90)
    ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.9)
    evaluator_validation = EvaluatorHoldout(URM_test,
                                            cutoff_list=[10],
                                            exclude_seen=True)

    # Stack the ICM transpose below the URM: rows are users followed by
    # features, so content features act as extra "users" for a CF model
    URM_ICM_train = sps.vstack([URM_train, ICM_all.T])
    URM_ICM_train = URM_ICM_train.tocsr()
    # Item-major variant: rows are items, columns are features plus users
    URM_ICM_train2 = sps.hstack([ICM_all, URM_train.T])
    URM_ICM_train2 = URM_ICM_train2.tocsr()

    earlystopping_keywargs = {
        "validation_every_n": 10,
        "stop_on_validation": True,
        "evaluator_object": evaluator_validation,
        "lower_validations_allowed": 5,
        "validation_metric": "MAP",
    }
Example 2
p3Param = {
    'topK': 64,
    'alpha': 0.5626527178823623,
    'min_rating': 0.4999280105627021,
    # 'implicit' expects a boolean; the original value [False, False, False]
    # is truthy and would silently enable implicit-rating handling
    'implicit': False
}

alpha1 = 0.4
alpha2 = 0.54  # cleaned up floating-point noise; the three weights sum to 1.0
alpha3 = 0.06

print(
    "*************************** Check that the parameters are good **********************"
)

URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)
itemCF_recommender = ItemKNNCFRecommender(URM_train)
itemCF_recommender.fit(**itemCFParam)
slim_recommender = SLIM_BPR_Cython(URM_train, recompile_cython=False)
slim_recommender.fit(**slimParam)
p3_recommender = P3alphaRecommender(URM_train)
p3_recommender.fit(**p3Param)

recommender1 = SimilarityHybridRecommender(URM_train,
                                           itemCF_recommender.W_sparse,
                                           slim_recommender.W_sparse,
                                           p3_recommender.W_sparse)
recommender1.fit(topK=100, alpha1=alpha1, alpha2=alpha2, alpha3=alpha3)

evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10])
eval_res = evaluator_validation.evaluateRecommender(recommender1)
print(eval_res)
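
SimilarityHybridRecommender above merges three item-item similarity matrices into one. Assuming the usual course-framework pattern (an assumption, not confirmed by this snippet), fit() computes a weighted sum of the sparse matrices and then prunes each row to the topK strongest entries; the core combination is roughly:

import scipy.sparse as sps

def combine_similarities(W1, W2, W3, alpha1, alpha2, alpha3):
    # Weighted sum of sparse item-item similarity matrices; all three
    # must share the same (n_items x n_items) shape
    return (alpha1 * W1 + alpha2 * W2 + alpha3 * W3).tocsr()

With the weights above (0.4, 0.54, 0.06) the item-CF and SLIM similarities dominate, while P3alpha contributes a small correction.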
Example 3
ICM_asset_path = "data/data_ICM_asset.csv"
ICM_asset_file = open(ICM_asset_path, 'r')

ICM_price_path = "data/data_ICM_price.csv"
ICM_price_file = open(ICM_price_path, 'r')

ICM_sub_class = "data/data_ICM_sub_class.csv"
ICM_sub_class_file = open(ICM_sub_class, 'r')

ICM_all, n_items, n_features = get_ICM(ICM_sub_class, URM_all)
print("Number of items is ", str(n_items))
print("n_features is ", str(n_features))

from Notebooks_utils.data_splitter import train_test_holdout

URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)
URM_train, URM_validation = train_test_holdout(URM_train, train_perc=0.9)

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

recommender_class = ItemKNNCFRecommender

parameterSearch = SearchBayesianSkopt(
    recommender_class,
    evaluator_validation=evaluator_validation,
    evaluator_test=evaluator_test)

output_folder_path = "result_experiments/"

# If the output directory does not exist, create it
import os

if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
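
The snippet stops right after the directory check, before defining the search space. With SearchBayesianSkopt a run typically continues as in the sketch below; the exact keyword names (e.g. parameter_search_space) vary between framework versions, so treat this as an assumed outline rather than the original code:

from skopt.space import Integer, Categorical
from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

# Hypothetical search space for ItemKNNCFRecommender
hyperparameters_range = {
    "topK": Integer(5, 1000),
    "shrink": Integer(0, 1000),
    "similarity": Categorical(["cosine", "jaccard"]),
    "normalize": Categorical([True, False]),
}

recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS=[URM_train],
    CONSTRUCTOR_KEYWORD_ARGS={},
    FIT_POSITIONAL_ARGS=[],
    FIT_KEYWORD_ARGS={},
)

parameterSearch.search(recommender_input_args,
                       parameter_search_space=hyperparameters_range,
                       n_cases=50,
                       n_random_starts=15,
                       output_folder_path=output_folder_path,
                       output_file_name_root=recommender_class.RECOMMENDER_NAME,
                       metric_to_optimize="MAP")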
Example 4
def crossval(URM_all, ICM_all, target_ids, k):

    seed = 1234 + k  #+ int(time.time())
    np.random.seed(seed)
    tp = 0.75
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=tp)
    ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.95)
    evaluator_validation = EvaluatorHoldout(URM_test,
                                            cutoff_list=[10],
                                            exclude_seen=True)
    args = {}

    p3alpha = P3alphaRecommender.P3alphaRecommender(URM_train)
    try:
        args = {
            "topK": 991,
            "alpha": 0.4705816992313091,
            "normalize_similarity": False
        }
        p3alpha.load_model(
            'SavedModels\\', p3alpha.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except Exception:
        print("Saved model not found. Fitting a new one...")
        p3alpha.fit(**args)
        p3alpha.save_model(
            'SavedModels\\', p3alpha.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp))

    rp3beta = RP3betaRecommender.RP3betaRecommender(URM_train)
    try:
        args = {
            "topK": 991,
            "alpha": 0.4705816992313091,
            "beta": 0.37,
            "normalize_similarity": False
        }
        rp3beta.load_model(
            'SavedModels\\', rp3beta.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except Exception:
        print("Saved model not found. Fitting a new one...")
        rp3beta.fit(**args)
        rp3beta.save_model(
            'SavedModels\\', rp3beta.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp))

    itemKNNCF = ItemKNNCFRecommender.ItemKNNCFRecommender(URM_train)
    try:
        args = {
            "topK": 1000,
            "shrink": 732,
            "similarity": "cosine",
            "normalize": True,
            "feature_weighting": "TF-IDF"
        }
        itemKNNCF.load_model(
            'SavedModels\\', itemKNNCF.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except Exception:
        print("Saved model not found. Fitting a new one...")
        itemKNNCF.fit(**args)
        itemKNNCF.save_model(
            'SavedModels\\', itemKNNCF.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp))

    userKNNCF = UserKNNCFRecommender.UserKNNCFRecommender(URM_train)
    try:
        args = {
            "topK": 131,
            "shrink": 2,
            "similarity": "cosine",
            "normalize": True
        }
        userKNNCF.load_model(
            'SavedModels\\', userKNNCF.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except Exception:
        print("Saved model not found. Fitting a new one...")
        userKNNCF.fit(**args)
        userKNNCF.save_model(
            'SavedModels\\', userKNNCF.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp))

    itemKNNCBF = ItemKNNCBFRecommender.ItemKNNCBFRecommender(
        URM_train, ICM_all)
    try:
        args = {
            "topK": 700,
            "shrink": 100,
            "similarity": 'jaccard',
            "normalize": True,
            "feature_weighting": "TF-IDF"
        }
        itemKNNCBF.load_model(
            'SavedModels\\', itemKNNCBF.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp) + ".zip")
    except Exception:
        print("Saved model not found. Fitting a new one...")
        itemKNNCBF.fit(**args)
        itemKNNCBF.save_model(
            'SavedModels\\', itemKNNCBF.RECOMMENDER_NAME + toFileName(args) +
            ",s=" + str(seed) + ",tp=" + str(tp))

    #cfw = CFW_D_Similarity_Linalg.CFW_D_Similarity_Linalg(URM_train, ICM_train, itemKNNCF.W_sparse)
    #cfw.fit(show_max_performance=False, logFile=None, loss_tolerance=1e-6,
    #        iteration_limit=500000, damp_coeff=0.5, topK=900, add_zeros_quota=0.5, normalize_similarity=True)

    # SLIM_BPR_Cython would need code changes to avoid a memory error;
    # left disabled since its results were poor anyway
    # bpr = SLIM_BPR_Cython(URM_train, recompile_cython=False)
    # bpr.fit(**{"topK": 1000, "epochs": 130, "symmetric": False, "sgd_mode": "adagrad", "lambda_i": 1e-05,
    #          "lambda_j": 0.01, "learning_rate": 0.0001})

    pureSVD = PureSVDRecommender.PureSVDRecommender(URM_train)
    pureSVD.fit(num_factors=1000)

    hyb = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, p3alpha, itemKNNCBF)
    hyb.fit(alpha=0.5)

    # Kaggle MAP 0.084 rp3beta, itemKNNCBF
    hyb2 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, pureSVD, itemKNNCBF)
    hyb2.fit(alpha=0.5)

    # Kaggle MAP 0.08667
    hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, hyb, hyb2)
    hyb3.fit(alpha=0.5)
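
    # The ItemKNNScoresHybridRecommender instances above blend two models at
    # score level. Assuming the usual course implementation, each user's
    # scores are combined as
    #     scores = alpha * scores_rec1 + (1 - alpha) * scores_rec2
    # so alpha=0.5 weighs both components equally.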

    #hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, userKNNCF)
    #hyb3.fit(alpha=0.5)

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(
        URM_train, ICM_all)
    # Kaggle MAP 0.08856
    try:
        # Full values: "alpha_P": 0.4108657561671193, "alpha": 0.6290871066510789
        args = {
            "topK_P": 903,
            "alpha_P": 0.41086575,
            "normalize_similarity_P": False,
            "topK": 448,
            "shrink": 20,
            "similarity": "tversky",
            "normalize": True,
            "alpha": 0.6290871,
            "feature_weighting": "TF-IDF"
        }
        hyb5.load_model(
            'SavedModels\\', hyb5.RECOMMENDER_NAME + toFileName(args) + ",s=" +
            str(seed) + ",tp=" + str(tp) + ".zip")
    except Exception:
        print("Saved model not found. Fitting a new one...")
        hyb5.fit(**args)
        hyb5.save_model(
            'SavedModels\\', hyb5.RECOMMENDER_NAME + toFileName(args) + ",s=" +
            str(seed) + ",tp=" + str(tp))

    # hyb5.fit(**{"topK_P": 1000, "alpha_P": 0.5432601071314623, "normalize_similarity_P": True, "topK": 620, "shrink": 0,
    #             "similarity": "tversky", "normalize": False, "alpha": 0.5707347522847057, "feature_weighting": "BM25"})

    # Kaggle MAP 0.086 :(
    #hyb6 = ScoresHybrid3Recommender.ScoresHybrid3Recommender(URM_train, rp3beta, itemKNNCBF, p3alpha)
    #hyb6.fit()

    hyb6 = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(
        URM_train, ICM_all)
    try:
        # Full values: "alpha_P": 0.5081918012150626, "alpha": 0.44740093610861603
        args = {
            "topK_P": 623,
            "alpha_P": 0.5081918,
            "normalize_similarity_P": False,
            "topK": 1000,
            "shrink": 1000,
            "similarity": "tversky",
            "normalize": True,
            "alpha": 0.4474009,
            "beta_P": 0.0,
            "feature_weighting": "TF-IDF"
        }
        hyb6.load_model(
            'SavedModels\\', hyb6.RECOMMENDER_NAME + toFileName(args) + ",s=" +
            str(seed) + ",tp=" + str(tp) + ".zip")
    except Exception:
        print("Saved model not found. Fitting a new one...")
        hyb6.fit(**args)
        hyb6.save_model(
            'SavedModels\\', hyb6.RECOMMENDER_NAME + toFileName(args) + ",s=" +
            str(seed) + ",tp=" + str(tp))

    v0 = evaluator_validation.evaluateRecommender(hyb)[0][10]["MAP"]
    v1 = evaluator_validation.evaluateRecommender(hyb2)[0][10]["MAP"]
    v2 = evaluator_validation.evaluateRecommender(hyb3)[0][10]["MAP"]
    v3 = evaluator_validation.evaluateRecommender(hyb5)[0][10]["MAP"]
    v4 = evaluator_validation.evaluateRecommender(hyb6)[0][10]["MAP"]

    #item_list = hyb3.recommend(target_ids, cutoff=10)
    #CreateCSV.create_csv(target_ids, item_list, 'ItemKNNCBF__RP3beta')

    return [v0, v1, v2, v3, v4]
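
A hypothetical driver for crossval (not part of this snippet) would average the returned MAP@10 values over several random holdouts:

import numpy as np

k_folds = 5
scores = np.array([crossval(URM_all, ICM_all, target_ids, k) for k in range(k_folds)])
# One column per recommender: hyb, hyb2, hyb3, hyb5, hyb6
print("Mean MAP@10 per hybrid:", scores.mean(axis=0))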
Example 5
def gethyb():
    start_time = time.time()

    URM_all, user_id_unique, item_id_unique = RecSys2020Reader.load_urm()
    ICM_all = RecSys2020Reader.load_icm_asset()
    target_ids = RecSys2020Reader.load_target()

    np.random.seed(12341288)
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)
    # ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.995)
    evaluator_validation = EvaluatorHoldout(URM_test,
                                            cutoff_list=[10],
                                            exclude_seen=True)
    #URM_train = URM_all
    ICM_train = ICM_all

    URM_ICM_train = sps.vstack([URM_train, ICM_all.T])
    URM_ICM_train = URM_ICM_train.tocsr()

    l_list = []
    profile_length = np.ediff1d(URM_train.indptr)
    block_size = int(len(profile_length) * 0.2)
    sorted_users = np.argsort(profile_length)
    groups = 5
    rec_list = []
    arg_list = []
    name_list = []

    for group_id in range(0, groups):
        start_pos = group_id * block_size
        end_pos = min((group_id + 1) * block_size, len(profile_length))

        users_in_group = sorted_users[start_pos:end_pos]

        users_in_group_p_len = profile_length[users_in_group]
        l_list.append(len(users_in_group))

        print("Group {}, average p.len {:.2f}, min {}, max {}".format(
            group_id, users_in_group_p_len.mean(), users_in_group_p_len.min(),
            users_in_group_p_len.max()))

    hyb_warm = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(
        URM_ICM_train, URM_ICM_train.T)
    hyb_warmV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(
        URM_ICM_train, URM_ICM_train.T)
    # Warm of Kaggle MAP 0.09466
    '''hyb_warm_args = {"topK_P": 127, "alpha_P": 0.35309465855346317, "normalize_similarity_P": False, "topK": 805,
                     "shrink": 307, "similarity": "tversky", "normalize": False, "alpha": 0.486665735781842, "feature_weighting": "TF-IDF"}
    hyb_warmV2_args = {"topK_P": 1496, "alpha_P": 0.4384309705759645, "normalize_similarity_P": False, "topK": 1023,
                       "shrink": 261, "similarity": "asymmetric", "normalize": False, "alpha": 0.7211670365702352, "feature_weighting": "TF-IDF"}'''
    hyb_warm_args = {
        "topK_P": 2000,
        "alpha_P": 0.5202318972174075,
        "normalize_similarity_P": False,
        "topK": 2000,
        "shrink": 2000,
        "similarity": "tversky",
        "normalize": True,
        "alpha": 1.0,
        "beta_P": 0.33040913500424834,
        "feature_weighting": "none"
    }
    hyb_warmV2_args = {
        "topK_P": 1238,
        "alpha_P": 0.580501466821829,
        "normalize_similarity_P": False,
        "topK": 1043,
        "shrink": 163,
        "similarity": "asymmetric",
        "normalize": False,
        "alpha": 0.25081946305309705,
        "feature_weighting": "BM25"
    }
    #{"topK_P": 2000, "alpha_P": 0.5292482627931302, "normalize_similarity_P": False, "topK": 2000, "shrink": 0,
    #"similarity": "tanimoto", "normalize": True, "alpha": 0.7963434906265208, "beta_P": 0.2692980157925566, "feature_weighting": "BM25"}

    hyb_cold = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(
        URM_ICM_train, URM_ICM_train.T)
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2 = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(
        URM_ICM_train, URM_ICM_train.T)
    '''hyb_cold_args = {"topK_P": 482, "alpha_P": 0.4999498678468517, "normalize_similarity_P": False, "topK": 1500,
                     "shrink": 212, "similarity": "cosine", "normalize": False, "alpha": 0.6841610038073574,
                     "feature_weighting": "BM25"}
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2_args = {"topK_P": 326, "alpha_P": 0.5120656418370607, "normalize_similarity_P": False, "topK": 151,
                       "shrink": 183, "similarity": "tversky", "normalize": True, "alpha": 0.6290067931193662, "feature_weighting": "BM25"}'''
    hyb_cold_args = {
        "topK_P": 2093,
        "alpha_P": 0.8263868403373367,
        "normalize_similarity_P": False,
        "topK": 298,
        "shrink": 1954,
        "similarity": "tanimoto",
        "normalize": False,
        "alpha": 0.608862998163905,
        "beta_P": 0.34975586706651757,
        "feature_weighting": "TF-IDF"
    }
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2_args = {
        "topK_P": 1490,
        "alpha_P": 0.5832972099071866,
        "normalize_similarity_P": False,
        "topK": 1533,
        "shrink": 1100,
        "similarity": "tanimoto",
        "normalize": False,
        "alpha": 0.15358895478386428,
        "beta_P": 0.002234792201790459,
        "feature_weighting": "BM25"
    }
    '''hyb_midV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_ICM_train, URM_ICM_train.T)
    # Cold of Kaggle MAP 0.09466
    hyb_midV2_args = {"topK_P": 2064, "alpha_P": 1.9131180703120496, "normalize_similarity_P": False, "topK": 154, "shrink": 620,
                      "similarity": "asymmetric", "normalize": True, "alpha": 0.013221786654690208, "feature_weighting": "TF-IDF"}
    #{"topK_P": 1577, "alpha_P": 0.1835912052126545, "normalize_similarity_P": false, "topK": 1439, "shrink": 3626,
    #"similarity": "cosine", "normalize": false, "alpha": 0.1507714323088927, "feature_weighting": "BM25"}'''

    rec_list.append(hyb_cold)
    arg_list.append(hyb_cold_args)
    name_list.append("hyb_cold")
    rec_list.append(hyb_warm)
    arg_list.append(hyb_warm_args)
    name_list.append("hyb_warm")
    rec_list.append(hyb_warmV2)
    arg_list.append(hyb_warmV2_args)
    name_list.append("hyb_warmV2")
    rec_list.append(hyb_coldV2)
    arg_list.append(hyb_coldV2_args)
    name_list.append("hyb_coldV2")
    '''rec_list.append(hyb_midV2)
    arg_list.append(hyb_midV2_args)
    name_list.append("hyb_midV2")'''

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(
        URM_train, ICM_train)
    hyb5_args = {
        "topK_P": 903,
        "alpha_P": 0.4108657561671193,
        "normalize_similarity_P": False,
        "topK": 448,
        "shrink": 5,
        "similarity": "tversky",
        "normalize": True,
        "alpha": 0.6290871066510789,
        "feature_weighting": "TF-IDF"
    }
    rec_list.append(hyb5)
    arg_list.append(hyb5_args)
    name_list.append("hyb5")

    tot_args = zip(rec_list, arg_list, name_list)
    pool = PoolWithSubprocess(processes=5, maxtasksperchild=1)
    resultList = pool.map(fitRec, tot_args)
    pool.close()
    pool.join()
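
    # fitRec (defined elsewhere) receives one (recommender, args, name) tuple,
    # fits the model and returns it with its name; a hypothetical sketch
    # consistent with how resultList is consumed below:
    #
    # def fitRec(rec_args_name):
    #     rec, args, name = rec_args_name
    #     rec.fit(**args)
    #     return [rec, name]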

    for el in resultList:
        if el[1] == "hyb_cold":
            hyb_cold = el[0]
        elif el[1] == "hyb_warm":
            hyb_warm = el[0]
        elif el[1] == "hyb_coldV2":
            hyb_coldV2 = el[0]
        elif el[1] == "hyb_midV2":
            hyb_midV2 = el[0]
        elif el[1] == "hyb_warmV2":
            hyb_warmV2 = el[0]
        elif el[1] == "hyb5":
            hyb5 = el[0]
        elif el[1] == "hyb6x":
            hyb6x = el[0]

    # cold, coldV2 and mid are the new ones

    #hyb = hyb_warm

    #hyb2 = hyb_cold

    hyb3 = ScoresHybridKNNCFKNNCBF.ScoresHybridKNNCFKNNCBF(
        URM_ICM_train, URM_ICM_train.T)
    hyb3.fit(
        **{
            "topK_CF": 488,
            "shrink_CF": 1500,
            "similarity_CF": "tversky",
            "normalize_CF": True,
            "topK": 1500,
            "shrink": 1500,
            "similarity": "asymmetric",
            "normalize": False,
            "alpha": 0.23233349150222427,
            "feature_weighting": "BM25"
        })
    hyb2 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, hyb_warm, hyb5)
    hyb2.fit(alpha=0.5)

    hyb6 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, hyb_warmV2, hyb5)
    hyb6.fit(alpha=0.5)

    hyb7 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, hyb6, hyb2)
    hyb7.fit(alpha=0.5)

    #hyb = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb3, hyb7)
    #hyb.fit(alpha=0.5)

    earlystopping_keywargs = {
        "validation_every_n": 1,
        "stop_on_validation": True,
        "evaluator_object": evaluator_validation,
        "lower_validations_allowed": 3,
        "validation_metric": "MAP",
    }

    ials = IALSRecommender.IALSRecommender(URM_ICM_train)
    ials.fit(**earlystopping_keywargs, num_factors=100, alpha=50)

    hyb = ials

    hyb7 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, hyb2, ials)
    hyb7.fit(alpha=0.5)

    hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, hyb2, ials)
    hyb3.fit(alpha=0.85)

    MAP_p3alpha_per_group = []
    MAP_itemKNNCF_per_group = []
    MAP_itemKNNCBF_per_group = []
    MAP_pureSVD_per_group = []
    MAP_hyb_per_group = []
    MAP_hyb2_per_group = []
    MAP_hyb3_per_group = []
    MAP_hyb5_per_group = []
    MAP_hyb6_per_group = []
    MAP_hyb7_per_group = []
    cutoff = 10
    args = {
        "block_size": block_size,
        "profile_length": profile_length,
        "sorted_users": sorted_users,
        "cutoff": cutoff,
        "URM_test": URM_test,
        "hyb": hyb,
        "hyb2": hyb2,
        "hyb3": hyb3,
        "hyb5": hyb5,
        "hyb6": hyb6,
        "hyb7": hyb7
    }

    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count() - 1,
                              maxtasksperchild=1)
    compute_group_MAP_partial = partial(compute_group_MAP, args)
    resultList = pool.map(compute_group_MAP_partial, range(0, groups))
    pool.close()
    pool.join()
    for el in resultList:
        MAP_hyb_per_group.append(el[0])
        MAP_hyb2_per_group.append(el[1])
        MAP_hyb3_per_group.append(el[2])
        MAP_hyb5_per_group.append(el[3])
        MAP_hyb6_per_group.append(el[4])
        if hyb7 is not None:
            MAP_hyb7_per_group.append(el[5])

    # Sequential fallback, kept in case the multiprocessing version hits a memory error
    '''for group_id in range(0, groups):
        start_pos = group_id * block_size
        end_pos = min((group_id + 1) * block_size, len(profile_length))

        users_in_group = sorted_users[start_pos:end_pos]

        users_in_group_p_len = profile_length[users_in_group]

        print("Group {}, average p.len {:.2f}, min {}, max {}".format(group_id,
                                                                      users_in_group_p_len.mean(),
                                                                      users_in_group_p_len.min(),
                                                                      users_in_group_p_len.max()))

        users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert=True)
        users_not_in_group = sorted_users[users_not_in_group_flag]

        evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[cutoff], ignore_users=users_not_in_group)

        results, _ = evaluator_test.evaluateRecommender(hyb7)
        MAP_hyb7_per_group.append(results[cutoff]["MAP"])'''

    import matplotlib.pyplot as pyplot
    '''pyplot.plot(MAP_p3alpha_per_group, label="p3alpha")
    pyplot.plot(MAP_itemKNNCF_per_group, label="itemKNNCF")
    pyplot.plot(MAP_itemKNNCBF_per_group, label="itemKNNCBF")
    pyplot.plot(MAP_pureSVD_per_group, label="pureSVD")'''
    pyplot.plot(MAP_hyb_per_group, label="hyb")
    pyplot.plot(MAP_hyb2_per_group, label="hyb2")
    pyplot.plot(MAP_hyb3_per_group, label="hyb3")
    pyplot.plot(MAP_hyb5_per_group, label="hyb5")
    pyplot.plot(MAP_hyb6_per_group, label="hyb6")
    if hyb7 is not None:
        pyplot.plot(MAP_hyb7_per_group, label="hyb7")
    pyplot.ylabel('MAP')
    pyplot.xlabel('User Group')
    pyplot.legend()
    pyplot.show()

    print(l_list)
    evaluator_validation = EvaluatorHoldout(URM_test,
                                            cutoff_list=[10],
                                            exclude_seen=True)
    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count() - 1,
                              maxtasksperchild=1)
    if hyb7 is not None:
        hyb_list = [hyb, hyb2, hyb3, hyb5, hyb6, hyb7]
    else:
        hyb_list = [hyb, hyb2, hyb3, hyb5, hyb6]
    resultList = pool.map(evaluator_validation.evaluateRecommender, hyb_list)
    pool.close()
    pool.join()
    for el in resultList:
        print(el)
    '''item_list = hyb7.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb_URM_ICM_cold_warm_V2_more_mix_mid')
    item_list = hyb2.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb2')
    item_list = hyb6.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb_URM_ICM')'''

    print("--- Execution time: %s seconds ---" % (time.time() - start_time))
    return hyb2
Example 6
def crossval(URM_all, ICM_all, target_ids, k):

    seed = 1234 + k  #+ int(time.time())
    np.random.seed(seed)  # use the per-fold seed, as in the earlier crossval
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.90)
    ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.95)
    evaluator_validation = EvaluatorHoldout(URM_test,
                                            cutoff_list=[10],
                                            exclude_seen=True)
    args = {}

    p3alpha = P3alphaRecommender.P3alphaRecommender(URM_train)
    args = {
        "topK": 991,
        "alpha": 0.4705816992313091,
        "normalize_similarity": False
    }
    p3alpha.fit(**args)

    #p3alpha2 = P3alphaRecommender.P3alphaRecommender(URM_train)
    #args = {"topK": 400, "alpha": 0.5305816992313091, "normalize_similarity": False}
    #p3alpha2.fit(**args)

    #rp3beta = RP3betaRecommender.RP3betaRecommender(URM_train)
    #args = {"topK": 991, "alpha": 0.4705816992313091, "beta": 0.15, "normalize_similarity": False}
    #rp3beta.fit(**args)

    itemKNNCF = ItemKNNCFRecommender.ItemKNNCFRecommender(URM_train)
    args = {
        "topK": 1000,
        "shrink": 732,
        "similarity": "cosine",
        "normalize": True,
        "feature_weighting": "TF-IDF"
    }
    itemKNNCF.fit(**args)

    userKNNCF = UserKNNCFRecommender.UserKNNCFRecommender(URM_train)
    args = {
        "topK": 131,
        "shrink": 2,
        "similarity": "cosine",
        "normalize": True
    }
    userKNNCF.fit(**args)

    itemKNNCBF = ItemKNNCBFRecommender.ItemKNNCBFRecommender(
        URM_train, ICM_all)
    args = {
        "topK": 700,
        "shrink": 100,
        "similarity": 'jaccard',
        "normalize": True,
        "feature_weighting": "TF-IDF"
    }
    itemKNNCBF.fit(**args)

    itemKNNCBF2 = ItemKNNCBFRecommender.ItemKNNCBFRecommender(
        URM_train, ICM_all)
    args = {
        "topK": 200,
        "shrink": 15,
        "similarity": 'jaccard',
        "normalize": True,
        "feature_weighting": "TF-IDF"
    }
    itemKNNCBF2.fit(**args)

    #cfw = CFW_D_Similarity_Linalg.CFW_D_Similarity_Linalg(URM_train, ICM_train, itemKNNCF.W_sparse)
    #cfw.fit(show_max_performance=False, logFile=None, loss_tolerance=1e-6,
    #        iteration_limit=500000, damp_coeff=0.5, topK=900, add_zeros_quota=0.5, normalize_similarity=True)

    # SLIM_BPR_Cython would need code changes to avoid a memory error;
    # left disabled since its results were poor anyway
    #bpr = SLIM_BPR_Cython(URM_train, recompile_cython=False)
    #bpr.fit(**{"topK": 1000, "epochs": 130, "symmetric": False, "sgd_mode": "adagrad", "lambda_i": 1e-05,
    #          "lambda_j": 0.01, "learning_rate": 0.0001})

    pureSVD = PureSVDRecommender.PureSVDRecommender(URM_train)
    pureSVD.fit(num_factors=340)

    #hyb = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, itemKNNCBF)
    #hyb.fit(alpha=0.5)
    hyb = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, itemKNNCBF, pureSVD)
    hyb.fit(alpha=0.5)

    # Kaggle MAP 0.084 rp3beta, itemKNNCBF
    #hyb2 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, itemKNNCBF)
    #hyb2.fit(alpha=0.5)
    hyb2 = ItemKNNSimilarityHybridRecommender.ItemKNNSimilarityHybridRecommender(
        URM_train, itemKNNCBF.W_sparse, itemKNNCF.W_sparse)
    hyb2.fit(topK=1600)

    # Kaggle MAP 0.08667
    hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(
        URM_train, hyb, hyb2)
    hyb3.fit(alpha=0.5)
    #hyb3 = RankingHybrid.RankingHybrid(URM_train, hyb, hyb2)

    #hyb3 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, p3alpha, userKNNCF)
    #hyb3.fit(alpha=0.5)

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(
        URM_train, ICM_all)
    # Kaggle MAP 0.08856
    args = {
        "topK_P": 903,
        "alpha_P": 0.4108657561671193,
        "normalize_similarity_P": False,
        "topK": 448,
        "shrink": 20,
        "similarity": "tversky",
        "normalize": True,
        "alpha": 0.6290871066510789,
        "feature_weighting": "TF-IDF"
    }
    hyb5.fit(**args)

    # hyb5.fit(**{"topK_P": 1000, "alpha_P": 0.5432601071314623, "normalize_similarity_P": True, "topK": 620, "shrink": 0,
    #             "similarity": "tversky", "normalize": False, "alpha": 0.5707347522847057, "feature_weighting": "BM25"})

    # Kaggle MAP 0.086 :(
    #hyb6 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb3, hyb5)
    #hyb6.fit()
    hyb6 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(
        URM_train, ICM_all)
    args = {
        "topK_P": 756,
        "alpha_P": 0.5292654015790155,
        "normalize_similarity_P": False,
        "topK": 1000,
        "shrink": 47,
        "similarity": "tversky",
        "normalize": False,
        "alpha": 0.5207647439152092,
        "feature_weighting": "none"
    }
    hyb6.fit(**args)
    '''hyb6 = ScoresHybridRP3betaKNNCBF.ScoresHybridRP3betaKNNCBF(URM_train, ICM_all)
    args = {"topK_P": 623, "alpha_P": 0.5081918012150626, "normalize_similarity_P": False, "topK": 1000,
            "shrink": 1000, "similarity": "tversky", "normalize": True, "alpha": 0.44740093610861603, "beta_P": 0.0,
            "feature_weighting": "TF-IDF"}
    hyb6.fit(**args)'''

    hyb7 = RankingHybrid.RankingHybrid(URM_train, hyb6, hyb3)

    v0 = evaluator_validation.evaluateRecommender(hyb)[0][10]["MAP"]
    v1 = evaluator_validation.evaluateRecommender(hyb2)[0][10]["MAP"]
    v2 = evaluator_validation.evaluateRecommender(hyb3)[0][10]["MAP"]
    #v2 = 0
    v3 = evaluator_validation.evaluateRecommender(hyb5)[0][10]["MAP"]
    v4 = evaluator_validation.evaluateRecommender(hyb6)[0][10]["MAP"]
    #v4 = 0
    v5 = evaluator_validation.evaluateRecommender(hyb7)[0][10]["MAP"]

    #item_list = hyb6.recommend(target_ids, cutoff=10)
    #CreateCSV.create_csv(target_ids, item_list, 'HybPureSVD')

    return [v0, v1, v2, v3, v4, v5]
Example 7
def gethyb():
    start_time = time.time()

    URM_all, user_id_unique, item_id_unique = RecSys2020Reader.load_urm()
    ICM_all = RecSys2020Reader.load_icm_asset()
    target_ids = RecSys2020Reader.load_target()

    #np.random.seed(12341288)
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)
    # ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.995)
    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)
    #URM_train = URM_all
    ICM_train = ICM_all

    URM_ICM_train = sps.vstack([URM_train, ICM_all.T])
    URM_ICM_train = URM_ICM_train.tocsr()

    l_list = []
    profile_length = np.ediff1d(URM_train.indptr)
    block_size = int(len(profile_length) * 0.2)
    sorted_users = np.argsort(profile_length)
    groups = 5
    rec_list = []
    arg_list = []
    name_list = []

    for group_id in range(0, groups):
        start_pos = group_id * block_size
        end_pos = min((group_id + 1) * block_size, len(profile_length))

        users_in_group = sorted_users[start_pos:end_pos]

        users_in_group_p_len = profile_length[users_in_group]
        l_list.append(len(users_in_group))

        print("Group {}, average p.len {:.2f}, min {}, max {}".format(group_id,
                                                                      users_in_group_p_len.mean(),
                                                                      users_in_group_p_len.min(),
                                                                      users_in_group_p_len.max()))

    hyb_warm = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, URM_train.T)
    hyb_warmV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    # Warm of Kaggle MAP 0.09466
    '''hyb_warm_args = {"topK_P": 127, "alpha_P": 0.35309465855346317, "normalize_similarity_P": False, "topK": 805,
                     "shrink": 307, "similarity": "tversky", "normalize": False, "alpha": 0.486665735781842, "feature_weighting": "TF-IDF"}
    hyb_warmV2_args = {"topK_P": 1496, "alpha_P": 0.4384309705759645, "normalize_similarity_P": False, "topK": 1023,
                       "shrink": 261, "similarity": "asymmetric", "normalize": False, "alpha": 0.7211670365702352, "feature_weighting": "TF-IDF"}'''
    hyb_warm_args = {"topK_P": 1500, "alpha_P": 0.499386187332916, "normalize_similarity_P": False, "topK": 1500,
                     "shrink": 0, "similarity": "cosine", "normalize": False, "alpha": 0.6783844599810798, "feature_weighting": "BM25"}
    hyb_warmV2_args = {"topK_P": 1407, "alpha_P": 0.5102184063631549, "normalize_similarity_P": False, "topK": 62,
                       "shrink": 104, "similarity": "tanimoto", "normalize": False, "alpha": 0.7722938163027667, "feature_weighting": "none"}

    hyb_cold = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    '''hyb_cold_args = {"topK_P": 482, "alpha_P": 0.4999498678468517, "normalize_similarity_P": False, "topK": 1500,
                     "shrink": 212, "similarity": "cosine", "normalize": False, "alpha": 0.6841610038073574,
                     "feature_weighting": "BM25"}
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2_args = {"topK_P": 326, "alpha_P": 0.5120656418370607, "normalize_similarity_P": False, "topK": 151,
                       "shrink": 183, "similarity": "tversky", "normalize": True, "alpha": 0.6290067931193662, "feature_weighting": "BM25"}'''
    hyb_cold_args = {"topK_P": 510, "alpha_P": 0.2857363628982497, "normalize_similarity_P": False, "topK": 483,
                     "shrink": 1491, "similarity": "asymmetric", "normalize": True, "alpha": 0.7682805033640728, "feature_weighting": "TF-IDF"}
    # Cold of Kaggle MAP 0.09466
    hyb_coldV2_args = {"topK_P": 1095, "alpha_P": 0.4546298466859472, "normalize_similarity_P": False, "topK": 866,
                       "shrink": 182, "similarity": "tanimoto", "normalize": False, "alpha": 0.5837079437871213, "feature_weighting": "BM25"}
    hyb_midV2 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    # Cold of Kaggle MAP 0.09466
    hyb_midV2_args = {"topK_P": 482, "alpha_P": 0.4999498678468517, "normalize_similarity_P": False, "topK": 1500,
                       "shrink": 212, "similarity": "cosine", "normalize": False, "alpha": 0.6841610038073574, "feature_weighting": "BM25"}

    rec_list.append(hyb_cold)
    arg_list.append(hyb_cold_args)
    name_list.append("hyb_cold")
    rec_list.append(hyb_warm)
    arg_list.append(hyb_warm_args)
    name_list.append("hyb_warm")
    rec_list.append(hyb_warmV2)
    arg_list.append(hyb_warmV2_args)
    name_list.append("hyb_warmV2")
    rec_list.append(hyb_coldV2)
    arg_list.append(hyb_coldV2_args)
    name_list.append("hyb_coldV2")
    rec_list.append(hyb_midV2)
    arg_list.append(hyb_midV2_args)
    name_list.append("hyb_midV2")

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_train)
    hyb5_args = {"topK_P": 903, "alpha_P": 0.4108657561671193, "normalize_similarity_P": False, "topK": 448,
                 "shrink": 5,
                 "similarity": "tversky", "normalize": True, "alpha": 0.6290871066510789, "feature_weighting": "TF-IDF"}
    rec_list.append(hyb5)
    arg_list.append(hyb5_args)
    name_list.append("hyb5")

    tot_args = zip(rec_list, arg_list, name_list)
    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count() - 1, maxtasksperchild=1)
    resultList = pool.map(fitRec, tot_args)
    pool.close()
    pool.join()

    for el in resultList:
        if el[1] == "hyb_cold":
            hyb_cold = el[0]
        elif el[1] == "hyb_warm":
            hyb_warm = el[0]
        elif el[1] == "hyb_coldV2":
            hyb_coldV2 = el[0]
        elif el[1] == "hyb_midV2":
            hyb_midV2 = el[0]
        elif el[1] == "hyb_warmV2":
            hyb_warmV2 = el[0]
        elif el[1] == "hyb5":
            hyb5 = el[0]
        elif el[1] == "hyb6x":
            hyb6x = el[0]


    hybuc = ScoresHybridSpecializedV3Warm.ScoresHybridSpecializedV3Warm(URM_train, ICM_all)
    hybuc.fit(**{"topK_P": 509, "alpha_P": 1.045671409326966, "normalize_similarity_P": False, "topK": 1291, "shrink": 430,
             "similarity": "asymmetric", "normalize": False, "alpha": 0.864672904054673, "feature_weighting": "TF-IDF"})

    hyb2 = hyb_warmV2

    hyb3 = ScoresHybridSpecializedFusion.ScoresHybridSpecializedFusion(URM_train, hyb_cold, hyb_warm, 5.9)

    hyb7 = ScoresHybridSpecializedFusion.ScoresHybridSpecializedFusion(URM_train, hyb_coldV2, hyb_warmV2, 5.9)

    hyb6 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb7, hyb5)
    hyb6.fit(alpha=0.5)

    hyb2 = ItemKNNScoresHybridRecommender.ItemKNNScoresHybridRecommender(URM_train, hyb3, hyb6)
    hyb2.fit(alpha=0.5)

    hyb = ScoresHybridSpecializedFusion.ScoresHybridSpecializedFusion(URM_train, hyb2, hybuc, 300)
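
    # ScoresHybridSpecializedFusion (defined elsewhere) appears to route each
    # user to one of the two recommenders based on profile length, with the
    # last argument (5.9 above, 300 here) as the cold/warm threshold. A
    # hypothetical sketch of that rule:
    #
    # profile_len = np.ediff1d(URM_train.indptr)
    # def fused_scores(user_id, cold_rec=hyb2, warm_rec=hybuc, threshold=300):
    #     rec = cold_rec if profile_len[user_id] < threshold else warm_rec
    #     return rec._compute_item_score([user_id])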

    MAP_p3alpha_per_group = []
    MAP_itemKNNCF_per_group = []
    MAP_itemKNNCBF_per_group = []
    MAP_pureSVD_per_group = []
    MAP_hyb_per_group = []
    MAP_hyb2_per_group = []
    MAP_hyb3_per_group = []
    MAP_hyb5_per_group = []
    MAP_hyb6_per_group = []
    MAP_hyb7_per_group = []
    cutoff = 10
    args = {"block_size": block_size, "profile_length": profile_length, "sorted_users": sorted_users, "cutoff": cutoff,
            "URM_test": URM_test, "hyb": hyb, "hyb2": hyb2, "hyb3": hyb3, "hyb5": hyb5, "hyb6": hyb6, "hyb7": hyb7}

    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count()-1, maxtasksperchild=1)
    compute_group_MAP_partial = partial(compute_group_MAP, args)
    resultList = pool.map(compute_group_MAP_partial, range(0, groups))
    pool.close()
    pool.join()
    for el in resultList:
        MAP_hyb_per_group.append(el[0])
        MAP_hyb2_per_group.append(el[1])
        MAP_hyb3_per_group.append(el[2])
        MAP_hyb5_per_group.append(el[3])
        MAP_hyb6_per_group.append(el[4])
        if hyb7 is not None:
            MAP_hyb7_per_group.append(el[5])

    # Sequential fallback, kept in case the multiprocessing version hits a memory error
    '''for group_id in range(0, groups):
        start_pos = group_id * block_size
        end_pos = min((group_id + 1) * block_size, len(profile_length))

        users_in_group = sorted_users[start_pos:end_pos]

        users_in_group_p_len = profile_length[users_in_group]

        print("Group {}, average p.len {:.2f}, min {}, max {}".format(group_id,
                                                                      users_in_group_p_len.mean(),
                                                                      users_in_group_p_len.min(),
                                                                      users_in_group_p_len.max()))

        users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert=True)
        users_not_in_group = sorted_users[users_not_in_group_flag]

        evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[cutoff], ignore_users=users_not_in_group)

        results, _ = evaluator_test.evaluateRecommender(hyb7)
        MAP_hyb7_per_group.append(results[cutoff]["MAP"])'''


    import matplotlib.pyplot as pyplot

    '''pyplot.plot(MAP_p3alpha_per_group, label="p3alpha")
    pyplot.plot(MAP_itemKNNCF_per_group, label="itemKNNCF")
    pyplot.plot(MAP_itemKNNCBF_per_group, label="itemKNNCBF")
    pyplot.plot(MAP_pureSVD_per_group, label="pureSVD")'''
    pyplot.plot(MAP_hyb_per_group, label="hyb")
    pyplot.plot(MAP_hyb2_per_group, label="hyb2")
    pyplot.plot(MAP_hyb3_per_group, label="hyb3")
    pyplot.plot(MAP_hyb5_per_group, label="hyb5")
    pyplot.plot(MAP_hyb6_per_group, label="hyb6")
    if hyb7 is not None:
        pyplot.plot(MAP_hyb7_per_group, label="hyb7")
    pyplot.ylabel('MAP')
    pyplot.xlabel('User Group')
    pyplot.legend()
    pyplot.show()

    print(l_list)
    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)
    pool = PoolWithSubprocess(processes=multiprocessing.cpu_count()-1, maxtasksperchild=1)
    if hyb7 is not None:
        hyb_list = [hyb, hyb2, hyb3, hyb5, hyb6, hyb7]
    else:
        hyb_list = [hyb, hyb2, hyb3, hyb5, hyb6]
    resultList = pool.map(evaluator_validation.evaluateRecommender, hyb_list)
    pool.close()
    pool.join()
    for el in resultList:
        print(el)
    '''item_list = hyb7.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb_URM_ICM_cold_warm_V2_more_mix_mid')
    item_list = hyb2.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb2')
    item_list = hyb6.recommend(target_ids, cutoff=10)
    CreateCSV.create_csv(target_ids, item_list, 'Hyb_URM_ICM')'''

    print("--- Execution time: %s seconds ---" % (time.time() - start_time))
    return hyb2
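
compute_group_MAP is not shown in any of these examples, but the sequential fallback commented above gives its logic: restrict the evaluator to one profile-length group at a time. A self-contained sketch (names assumed from the snippet):

import numpy as np

def group_map(recommender, URM_test, sorted_users, profile_length,
              block_size, group_id, cutoff=10):
    # Users belonging to this profile-length group
    start_pos = group_id * block_size
    end_pos = min((group_id + 1) * block_size, len(profile_length))
    users_in_group = sorted_users[start_pos:end_pos]

    # Evaluate while ignoring every user outside the group
    users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert=True)
    users_not_in_group = sorted_users[users_not_in_group_flag]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=[cutoff],
                                 ignore_users=users_not_in_group)
    results, _ = evaluator.evaluateRecommender(recommender)
    return results[cutoff]["MAP"]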