Example #1
0
def get_ICM_all(reader: RecSys2019Reader):
    """
    Build the complete ICM_all matrix with feature engineering applied.

    :param reader: data splitter holding the loaded URM/UCM/ICM data
    :return: the assembled ICM_all sparse matrix
    """
    URM_all = reader.get_URM_all()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_all_dict = reader.get_loaded_ICM_dict()
    ICM_all_dict.pop("ICM_all")

    # Derive additional item features from the interactions and user data
    ICM_all_dict = apply_feature_engineering_ICM(
        ICM_all_dict, URM_all, UCM_all_dict,
        ICM_names_to_count=["ICM_sub_class"],
        UCM_names_to_list=["UCM_age"])

    # Keep only values below Q3 + k * IQR (upper-fence outlier removal)
    def _below_upper_fence(k):
        return lambda x: x < np.quantile(x, q=0.75) + k * (
            np.quantile(x, q=0.75) - np.quantile(x, q=0.25))

    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })

    # Compress skewed numeric features
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": lambda x: np.log1p(1 / x),
            "ICM_price": lambda x: np.log1p(1 / x),
            "ICM_item_pop": np.log1p,
            "ICM_sub_class_count": np.log1p,
            "ICM_age": lambda x: x ** (1 / 2.5),
        })
    ICM_all_dict = apply_discretization_ICM(
        ICM_all_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
            "ICM_sub_class_count": 50,
        })

    return build_ICM_all_from_dict(ICM_all_dict)
Example #2
0
def main():
    """Load the leave-k-out split, build feature matrices, and evaluate the model."""
    # Data loading
    root_data_path = os.path.join(get_project_root_path(), "data/")
    data_reader = New_DataSplitter_leave_k_out(
        RecSys2019Reader(root_data_path),
        k_out_value=K_OUT,
        use_validation_set=False,
        allow_cold_users=ALLOW_COLD_USERS,
        force_new_split=True,
        seed=get_split_seed())
    data_reader.load_data()

    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all, _ = get_ICM_train_new(data_reader)
    UCM_all, _ = get_UCM_train_new(data_reader)

    # Users excluded from the evaluation
    user_mapper = data_reader.get_original_user_id_to_index_mapper()
    ignore_users = get_ignore_users(
        URM_train,
        user_mapper,
        lower_threshold=LOWER_THRESHOLD,
        upper_threshold=UPPER_THRESHOLD,
        ignore_non_target_users=IGNORE_NON_TARGET_USERS)
    evaluator = EvaluatorHoldout(
        URM_test, cutoff_list=[CUTOFF], ignore_users=ignore_users)

    # Build and evaluate the model
    model = get_model(URM_train, ICM_all, UCM_all)
    print(evaluator.evaluateRecommender(model))
Example #3
0
def get_UCM_all(reader: RecSys2019Reader):
    """Return the full UCM_all matrix after feature engineering.

    :param reader: data splitter holding the loaded URM/UCM/ICM data
    :return: the assembled UCM_all sparse matrix
    """
    URM_all = reader.get_URM_all()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()

    # Add item-derived features to the user matrices
    UCM_all_dict = apply_feature_engineering_UCM(
        UCM_all_dict, URM_all, ICM_dict, ICM_names_to_UCM=["ICM_sub_class"])

    # These are useful feature weighting for UserCBF_CF_Warm
    transform_mapper = {
        "UCM_sub_class": lambda x: x / 2,
        "UCM_user_act": np.log1p,
    }
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict, UCM_name_to_transform_mapper=transform_mapper)
    UCM_all_dict = apply_discretization_UCM(
        UCM_all_dict, UCM_name_to_bins_mapper={"UCM_user_act": 50})
    return build_UCM_all_from_dict(UCM_all_dict)
Example #4
0
def main():
    """Run the user-demographic hyper-parameter search after removing
    users that have exactly one interaction from both URM splits.
    """
    args = get_arguments()

    # Data loading
    root_data_path = args.reader_path
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=K_OUT,
                                               allow_cold_users=ALLOW_COLD_USERS,
                                               use_validation_set=False,
                                               force_new_split=True, seed=args.seed)
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    def _drop_singleton_users(URM):
        """Zero out the rows of users with exactly one interaction.

        :param URM: sparse user-rating matrix
        :return: (URM in CSR format with those rows emptied, array of dropped row ids)
        """
        singleton_mask = np.ediff1d(URM.tocsr().indptr) == 1
        singleton_users = np.arange(URM.shape[0])[singleton_mask]
        URM = URM.tolil()
        URM[singleton_users, :] = 0
        return URM.tocsr(), singleton_users

    # Remove interactions of users that have len == 1 from URM_train and URM_test
    URM_train, _ = _drop_singleton_users(URM_train)
    # NOTE(review): as in the original code, only URM_test's singleton users are
    # appended to the ignore list — confirm train-side singletons should not
    # also be excluded from evaluation.
    URM_test, len_1_users = _drop_singleton_users(URM_test)

    UCM_all = get_UCM_train_cold(data_reader)

    ignore_users = get_ignore_users(URM_train, data_reader.get_original_user_id_to_index_mapper(),
                                    lower_threshold=args.lower_threshold, upper_threshold=args.upper_threshold,
                                    ignore_non_target_users=args.exclude_non_target)
    ignore_users = np.concatenate([ignore_users, len_1_users])

    # Setting evaluator
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list, ignore_users=ignore_users)

    # HP tuning
    print("Start tuning...")
    version_path = "../../report/hp_tuning/{}/".format(args.recommender_name)
    now = datetime.now().strftime('%b%d_%H-%M-%S')
    now = now + "_k_out_value_{}/".format(K_OUT)
    version_path = version_path + "/" + now

    run_parameter_search_user_demographic(URM_train=URM_train, UCM_object=UCM_all, UCM_name="UCM_all",
                                          recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                          evaluator_validation=evaluator,
                                          metric_to_optimize="MAP",
                                          output_folder_path=version_path,
                                          parallelizeKNN=True,
                                          n_cases=int(args.n_cases),
                                          n_random_starts=int(args.n_random_starts))

    print("...tuning ended")
Example #5
0
def read_split_load_data(k_out, allow_cold_users, seed):
    """Create and load a leave-k-out split of the RecSys2019 dataset.

    :param k_out: number of interactions held out per user
    :param allow_cold_users: whether users with no train interactions are kept
    :param seed: random seed used for the split
    :return: the loaded New_DataSplitter_leave_k_out instance
    """
    data_path = os.path.join(get_project_root_path(), "data/")
    splitter = New_DataSplitter_leave_k_out(
        RecSys2019Reader(data_path),
        k_out_value=k_out,
        use_validation_set=False,
        allow_cold_users=allow_cold_users,
        force_new_split=True,
        seed=seed)
    splitter.load_data()
    return splitter
Example #6
0
def get_ICM_all_new(reader: RecSys2019Reader):
    """
    Build the complete ICM_all after filtering, transformation,
    discretization and feature weighting.

    :param reader: data splitter holding the loaded ICM data
    :return: the assembled ICM_all sparse matrix
    """
    ICM_all_dict = reader.get_loaded_ICM_dict()
    ICM_all_dict.pop("ICM_all")

    # Keep only values below Q3 + k * IQR (upper-fence outlier removal)
    def _below_upper_fence(k):
        return lambda x: x < np.quantile(x, q=0.75) + k * (
            np.quantile(x, q=0.75) - np.quantile(x, q=0.25))

    ICM_all_dict = apply_filtering_ICM(
        ICM_all_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })

    # Apply useful transformation
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": lambda x: np.log1p(1 / x),
            "ICM_price": lambda x: np.log1p(1 / x),
            "ICM_item_pop": np.log1p,
        })
    ICM_all_dict = apply_discretization_ICM(
        ICM_all_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
        })

    # Apply feature weighting
    ICM_all_dict = apply_transformation_ICM(
        ICM_all_dict,
        ICM_name_to_transform_mapper={
            "ICM_price": lambda x: x * 1.8474248499810804,
            "ICM_asset": lambda x: x * 1.2232716972721878,
            "ICM_sub_class": lambda x: x * 1.662671860026709,
            "ICM_item_pop": lambda x: x * 0.886528360392298,
        })
    return build_ICM_all_from_dict(ICM_all_dict)
Example #7
0
def main():
    """Run collaborative-filtering hyper-parameter search on a leave-3-out split."""
    args = get_arguments()

    # Data loading
    data_reader = RecSys2019Reader(args.reader_path)
    data_reader = New_DataSplitter_leave_k_out(
        data_reader, k_out_value=3, use_validation_set=False,
        force_new_split=True, seed=args.seed)
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    # Some recommenders train on the URM stacked with the (transposed) ICM
    name = args.recommender_name
    if name == "sslim_bpr":
        ICM_all = get_ICM_train(data_reader)
        URM_train = sps.vstack([URM_train, ICM_all.T], format="csr")
    if name == "rp3beta_side":
        ICM_all = get_ICM_train(data_reader)
        URM_train = sps.vstack([URM_train, ICM_all.T], format="csr")
        URM_train = TF_IDF(URM_train).tocsr()
    if name == "pure_svd":
        URM_train = TF_IDF(URM_train).tocsr()
    if name == "pure_svd_side":
        ICM_all = get_ICM_train(data_reader)
        URM_train = sps.vstack([URM_train, ICM_all.T], format="csr")

    # Setting evaluator
    exclude_cold_users = args.exclude_users
    h = int(args.focus_on_high)
    fol = int(args.focus_on_low)
    if h != 0:
        print("Excluding users with less than {} interactions".format(h))
        ignore_users = np.arange(URM_train.shape[0])[
            np.ediff1d(URM_train.tocsr().indptr) < h]
    elif fol != 0:
        print("Excluding users with more than {} interactions".format(fol))
        ignore_users = np.arange(URM_train.shape[0])[
            np.ediff1d(URM_train.tocsr().indptr) > fol]
        if exclude_cold_users:
            cold_users = np.arange(URM_train.shape[0])[
                np.ediff1d(URM_train.tocsr().indptr) == 0]
            ignore_users = np.unique(np.concatenate((cold_users, ignore_users)))
    elif exclude_cold_users:
        print("Excluding cold users...")
        ignore_users = np.arange(URM_train.shape[0])[
            np.ediff1d(URM_train.tocsr().indptr) == 0]
    else:
        ignore_users = None

    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list, ignore_users=ignore_users)

    # HP tuning
    print("Start tuning...")
    version_path = "../../report/hp_tuning/{}/".format(args.recommender_name)
    now = datetime.now().strftime('%b%d_%H-%M-%S')
    now = now + "_k_out_value_3/"
    version_path = version_path + "/" + now

    runParameterSearch_Collaborative(URM_train=URM_train,
                                     recommender_class=RECOMMENDER_CLASS_DICT[args.recommender_name],
                                     evaluator_validation=evaluator,
                                     metric_to_optimize="MAP",
                                     output_folder_path=version_path,
                                     n_cases=int(args.n_cases),
                                     n_random_starts=int(args.n_random_starts))
    print("...tuning ended")
Example #8
0
from datetime import datetime

from course_lib.Base.Evaluation.Evaluator import EvaluatorHoldout
from src.data_management.New_DataSplitter_leave_k_out import New_DataSplitter_leave_k_out
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.dataframe_preprocesser import get_preprocessed_dataframe
from src.tuning.holdout_validation.run_parameter_search_advanced_top_pop import run_parameter_search_advanced_top_pop
from src.utils.general_utility_functions import get_split_seed

if __name__ == '__main__':
    # Data loading
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    # Mapper from original user ids to URM row indices
    mapper = data_reader.get_original_user_id_to_index_mapper()
    df = get_preprocessed_dataframe("../../data/", keep_warm_only=True)

    # Setting evaluator
    # warm_users_mask = np.ediff1d(URM_train.tocsr().indptr) > 0
    # warm_users = np.arange(URM_train.shape[0])[warm_users_mask]
    # ignore_users = warm_users
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list)

    # HP tuning
    # NOTE(review): this snippet appears truncated — the actual call to
    # run_parameter_search_advanced_top_pop is not visible in this fragment.
    print("Start tuning...")
Example #9
0
from course_lib.Base.NonPersonalizedRecommender import TopPop
from scripts.fm_model.write_ffm_data_uncompressed import get_ICM_with_fields, get_UCM_with_fields
from scripts.scripts_utils import set_env_variables
from src.data_management.DataPreprocessing import DataPreprocessingRemoveColdUsersItems
from src.data_management.New_DataSplitter_leave_k_out import *
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.model import new_best_models
from src.model.FactorizationMachine.FieldAwareFMRecommender import FieldAwareFMRecommender
from src.utils.general_utility_functions import get_split_seed, get_project_root_path

if __name__ == '__main__':
    set_env_variables()

    # Data loading
    root_data_path = "../data/"
    data_reader = RecSys2019Reader(root_data_path)
    # Drop cold users/items below the given interaction thresholds before splitting
    data_reader = DataPreprocessingRemoveColdUsersItems(data_reader,
                                                        threshold_users=25,
                                                        threshold_items=20)
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=1,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    # Build ICMs
    ICM_all, item_feature_fields = get_ICM_with_fields(data_reader)

    # Build UCMs: do not change the order of ICMs and UCMs
Example #10
0
from src.data_management.RecSys2019Reader import RecSys2019Reader
from course_lib.Data_manager.DataSplitter_k_fold import DataSplitter_Warm_k_fold
from course_lib.Base.Evaluation.Evaluator import *
from course_lib.ParameterTuning.run_parameter_search import *
from course_lib.Notebooks_utils.data_splitter import train_test_holdout

if __name__ == '__main__':
    # Data loading
    dataset = RecSys2019Reader("../data/train.csv", "../data/tracks.csv")
    dataset = DataSplitter_Warm_k_fold(dataset, n_folds=10)
    dataset.load_data()
    URM_train, URM_test = dataset.get_URM_train_for_test_fold(n_test_fold=8)

    # Hyperparameter tuning
    cutoff_list = [10]
    evaluator = EvaluatorHoldout(URM_test, cutoff_list=cutoff_list)

    # A 95% holdout of the test matrix is used only for early stopping
    fake_URM_train, subset_URM_test = train_test_holdout(URM_test,
                                                         train_perc=0.95)
    evaluator_early_stopping = EvaluatorHoldout(subset_URM_test,
                                                cutoff_list=cutoff_list)

    # It would be better to use a subset of the matrix for early stopping,
    # so that it runs much faster...

    print("Start tuning...")
    runParameterSearch_Collaborative(
        URM_train=URM_train,
        recommender_class=SLIM_BPR_Cython,
        evaluator_validation=evaluator,
        evaluator_validation_earlystopping=evaluator_early_stopping,
        metric_to_optimize="MAP",
Example #11
0
from src.data_management.New_DataSplitter_leave_k_out import *
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.RecSys2019Reader_utils import get_ICM_numerical, merge_UCM
from src.data_management.data_getter import get_warmer_UCM
from src.model.FallbackRecommender.AdvancedTopPopular import AdvancedTopPopular
from src.plots.recommender_plots import *
from src.data_management.dataframe_preprocesser import get_preprocessed_dataframe
from src.model import best_models
from src.utils.general_utility_functions import get_split_seed

if __name__ == '__main__':
    # Data reading
    data_reader = RecSys2019Reader()
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()

    # Mapper from original user ids to URM row indices
    mapper = data_reader.SPLIT_GLOBAL_MAPPER_DICT['user_original_ID_to_index']
    df = get_preprocessed_dataframe("../../data/", keep_warm_only=True)

    # Build ICMs
    ICM_numerical, _ = get_ICM_numerical(data_reader.dataReader_object)
    ICM_categorical = data_reader.get_ICM_from_name("ICM_sub_class")

    # Build UCMs
    URM_all = data_reader.dataReader_object.get_URM_all()
    UCM_age = data_reader.dataReader_object.get_UCM_from_name("UCM_age")
Example #12
0
from course_lib.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from src.data_management.RecSys2019Reader import RecSys2019Reader
from course_lib.Data_manager.DataSplitter_k_fold import DataSplitter_Warm_k_fold

if __name__ == '__main__':
    # Load the dataset and split it into 10 warm folds, testing on fold 9
    dataset = RecSys2019Reader("../../data/")
    dataset = DataSplitter_Warm_k_fold(dataset, n_folds=10)
    dataset.load_data()
    URM_train, URM_test = dataset.get_URM_train_for_test_fold(n_test_fold=9)

    # Fit a user-based CF model and print one recommendation list
    recommender = UserKNNCFRecommender(URM_train)
    recommender.fit()
    recommendation = recommender.recommend(1, cutoff=10)
    print("The recommendation for user 1 is: {}".format(recommendation))
Example #13
0
# + {"pycharm": {"name": "#%%\n", "is_executing": false}}
# NOTE(review): fragment of a jupytext notebook — df_target is defined in an
# earlier cell not visible here; presumably the target-users dataframe.
df_target.head()

# + {"pycharm": {"name": "#%%\n", "is_executing": false}}
# Ids of the users we must produce recommendations for
target_users = df_target.user_id.values
target_users

# + {"pycharm": {"name": "#%%\n", "is_executing": false}}
print("There are %d users in the target users" % len(target_users))
# -

# ## Analyze target users w.r.t. URM

# + {"pycharm": {"name": "#%%\n", "is_executing": false}}
dataset = RecSys2019Reader()
dataset.load_data()

# + {"pycharm": {"name": "#%%\n", "is_executing": false}}
URM_all = dataset.get_URM_all()
URM_all

# + {"pycharm": {"name": "#%%\n", "is_executing": false}}
URM_user_mapper = dataset.get_user_original_ID_to_index_mapper()
original_users_URM = list(URM_user_mapper.keys())

# + {"pycharm": {"name": "#%%\n", "is_executing": false}}
# Target users that do not appear in the URM at all
mask = np.in1d(target_users, original_users_URM, assume_unique=True)
missing_users = target_users[~mask]
missing_users
Example #14
0
def get_UCM_all_new(reader: RecSys2019Reader):
    """Return the full UCM_all after ICM preprocessing, feature
    engineering, entropy features and discretization.

    :param reader: data splitter holding the loaded URM/UCM/ICM data
    :return: the assembled UCM_all sparse matrix
    """
    URM_all = reader.get_URM_all()
    UCM_all_dict = reader.get_loaded_UCM_dict()
    ICM_dict = reader.get_loaded_ICM_dict()

    # Preprocess ICM
    ICM_dict.pop("ICM_all")
    ICM_dict = apply_feature_engineering_ICM(
        ICM_dict, URM_all, UCM_all_dict,
        ICM_names_to_count=["ICM_sub_class"],
        UCM_names_to_list=["UCM_age"])

    # Keep only values below Q3 + k * IQR (upper-fence outlier removal)
    def _below_upper_fence(k):
        return lambda x: x < np.quantile(x, q=0.75) + k * (
            np.quantile(x, q=0.75) - np.quantile(x, q=0.25))

    ICM_dict = apply_filtering_ICM(
        ICM_dict,
        ICM_name_to_filter_mapper={
            "ICM_asset": _below_upper_fence(0.72),
            "ICM_price": _below_upper_fence(4),
        })
    ICM_dict = apply_transformation_ICM(
        ICM_dict,
        ICM_name_to_transform_mapper={
            "ICM_asset": lambda x: np.log1p(1 / x),
            "ICM_price": lambda x: np.log1p(1 / x),
            "ICM_item_pop": np.log1p,
            "ICM_sub_class_count": np.log1p,
            "ICM_age": lambda x: x ** (1 / 2.5),
        })
    ICM_dict = apply_discretization_ICM(
        ICM_dict,
        ICM_name_to_bins_mapper={
            "ICM_asset": 200,
            "ICM_price": 200,
            "ICM_item_pop": 50,
            "ICM_sub_class_count": 50,
        })

    # Preprocess UCM
    UCM_all_dict = apply_feature_engineering_UCM(
        UCM_all_dict, URM_all, ICM_dict,
        ICM_names_to_UCM=["ICM_sub_class", "ICM_item_pop"])
    UCM_all_dict = apply_feature_entropy_UCM(
        UCM_all_dict, UCM_names_to_entropy=["UCM_sub_class"])

    # Apply useful transformation
    UCM_all_dict = apply_transformation_UCM(
        UCM_all_dict, UCM_name_to_transform_mapper={"UCM_user_act": np.log1p})
    UCM_all_dict = apply_discretization_UCM(
        UCM_all_dict,
        UCM_name_to_bins_mapper={
            "UCM_user_act": 50,
            "UCM_sub_class_entropy": 20,
        })

    return build_UCM_all_from_dict(UCM_all_dict)