Ejemplo n.º 1
0
    def test_add_UCM_information(self):
        """Verify the age and region columns produced by ``add_UCM_information``.

        A labelled base dataframe is built for the first 5000 user indices,
        passed through ``add_UCM_information``, and then every row of the
        result is compared against the age demographic and the region UCM
        obtained from ``self.data_reader``.
        """
        user_id_array = np.arange(5000)

        base_df = get_boosting_base_dataframe(user_id_array,
                                              self.main_rec,
                                              cutoff=self.cutoff)
        labels, _, _ = get_label_array(base_df, self.URM_train)
        base_df['label'] = labels

        reader = self.data_reader
        enriched_df = add_UCM_information(
            base_df, reader.get_original_user_id_to_index_mapper(), self.path)

        # Reference age per user, as computed by get_user_demographic.
        UCM_age = reader.get_UCM_from_name("UCM_age")
        age_mapper = reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
            "UCM_age")
        age_demographic = get_user_demographic(UCM_age, age_mapper)

        # Reference regions: invert the mapper so that a feature index leads
        # back to the (integer) original region value.
        UCM_region = reader.get_UCM_from_name("UCM_region")
        region_mapper = reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
            "UCM_region")
        original_region_by_index = {
            index: int(original)
            for original, index in region_mapper.items()
        }

        row_users = enriched_df['user_id'].values
        row_ages = enriched_df['age'].values
        row_age_flags = enriched_df['age_imputed_flag'].values

        for row, (user, age, age_imputed_flag) in enumerate(
                zip(row_users, row_ages, row_age_flags)):
            # Age: -1 in the demographic marks a user whose age had to be
            # imputed; the dataframe must then flag it and hold the value 5.
            known_age = age_demographic[user]
            if known_age == -1:
                expected_flag = 1
                expected_age = 5  # Imputed value (mode + 1)
            else:
                expected_flag = 0
                expected_age = known_age
            assert age_imputed_flag == expected_flag
            assert age == expected_age

            # Region: each "region_<value>" column must be 1 exactly when the
            # user's UCM row contains that region, and 0 otherwise.
            row_start = UCM_region.indptr[user]
            row_stop = UCM_region.indptr[user + 1]
            user_regions = [
                original_region_by_index[feature_index]
                for feature_index in UCM_region.indices[row_start:row_stop]
            ]
            for region in original_region_by_index.values():
                observed = enriched_df["region_{}".format(region)].iloc[row]
                expected = 1 if region in user_regions else 0
                assert observed == expected, "User {} has not correct region {}".format(
                    user, region)
Ejemplo n.º 2
0
    def __init__(self,
                 URM_train,
                 UCM_age,
                 ICM_subclass,
                 subclass_feature_to_id_mapper,
                 age_mapper_to_original,
                 recommender: BaseRecommender,
                 rerank_top_n=10):
        """Set up the wrapper around ``recommender`` and its age/subclass data.

        :param URM_train: training URM; stored as ``self.URM`` and forwarded
            to the parent constructor
        :param UCM_age: age UCM handed to ``get_user_demographic``
        :param ICM_subclass: subclass ICM handed to ``get_sub_class_content``
        :param subclass_feature_to_id_mapper: mapper handed to
            ``get_sub_class_content`` together with ``ICM_subclass``
        :param age_mapper_to_original: mapper handed to
            ``get_user_demographic``; its keys, sorted and cast to ``int``,
            become ``self.age_list``
        :param recommender: recommender stored as ``self.inner_recommender``
        :param rerank_top_n: stored as ``self.rerank_top_n``
        """
        # Data
        self.URM = URM_train

        # Age information (binned demographic plus the sorted list of ages)
        self.age_demographic = get_user_demographic(
            UCM_age, age_mapper_to_original, binned=True)
        sorted_ages = np.sort(np.array(list(age_mapper_to_original.keys())))
        self.age_list = list(map(int, sorted_ages))

        # Subclass information, in both binned and non-binned form
        self.subclass_content_dict = get_sub_class_content(
            ICM_subclass, subclass_feature_to_id_mapper, binned=True)
        self.subclass_content = get_sub_class_content(
            ICM_subclass, subclass_feature_to_id_mapper, binned=False)

        # Age-subclass containers start out empty
        self.sub_age_dict = {}
        self.count_sub_ace_dict = {}

        # Wrapped recommender
        self.inner_recommender = recommender
        self.rerank_top_n = rerank_top_n

        # Recommender parameters: all unset at construction time
        self.filter_subclass_age = None

        self.filter_subclass_user = self.min_num_ratings_subclass_user = None
        self.users_subclass = np.array([])

        self.subclass_rerank = None
        self.min_num_ratings_subclass_rerank = None
        self.max_ratings_user_subclass_rerank = None

        self.filter_price_per_user = self.filter_asset_per_user = None
        self.filter_price_per_age = self.filter_asset_per_age = None

        super().__init__(URM_train)
Ejemplo n.º 3
0
    else:
        f = None

    # Data loading
    root_data_path = "../../data/"
    data_reader = RecSys2019Reader(root_data_path)
    data_reader = New_DataSplitter_leave_k_out(data_reader, k_out_value=1, use_validation_set=False,
                                               force_new_split=True, seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    ICM_all = get_ICM_train(data_reader)
    UCM_all = get_UCM_train(data_reader)

    UCM_age = data_reader.get_UCM_from_name("UCM_age")
    age_feature_to_id_mapper = data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name("UCM_age")
    age_demographic = get_user_demographic(UCM_age, age_feature_to_id_mapper, binned=True)

    ICM_subclass = data_reader.get_ICM_from_name("ICM_sub_class")
    subclass_feature_to_id_mapper = data_reader.dataReader_object.get_ICM_feature_to_index_mapper_from_name(
        "ICM_sub_class")
    subclass_content_dict = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=True)
    subclass_content = get_sub_class_content(ICM_subclass, subclass_feature_to_id_mapper, binned=False)

    # DOES SUBCLASS DISTRIBUTION CHANGES BETWEEN AGES?
    # Collect distributions
    age_list = np.sort(np.array(list(age_feature_to_id_mapper.keys())))
    age_list = [int(age) for age in age_list]
    sub_age_dict = {}
    for age in age_list:
        users_age = get_users_of_age(age_demographic=age_demographic, age_list=[age])
        URM_age = URM_train[users_age].copy()
Ejemplo n.º 4
0
    UCM_age_region = get_warmer_UCM(UCM_age_region, URM_all, threshold_users=3)
    UCM_all, _ = merge_UCM(UCM_age_region, URM_train, {}, {})

    ICM_categorical = data_reader.get_ICM_from_name("ICM_sub_class")
    ICM_all, _ = merge_ICM(ICM_categorical, URM_train.T, {}, {})

    # Model definition and fitting
    model = best_models.IALS.get_model(URM_train)

    version_path = "../../report/graphics/ials/"
    now = datetime.now().strftime('%b%d_%H-%M-%S')
    now = now + "_k_out_value_3/"
    version_path = version_path + "/" + now

    # Plots
    demographic_age = get_user_demographic(UCM_age, URM_all, 3)
    demographic_region = get_user_demographic(UCM_region, URM_all, 3)
    demographic_list = [demographic_age, demographic_region]
    demographic_list_name = ['age', 'region']

    basic_plots_recommender(model,
                            URM_train,
                            URM_test,
                            output_path_folder=version_path,
                            save_on_file=True,
                            compare_top_pop_points=None,
                            is_compare_top_pop=True,
                            demographic_list=demographic_list,
                            demographic_list_name=demographic_list_name)
Ejemplo n.º 5
0
if __name__ == '__main__':
    data_reader = RecSys2019Reader("../../data/")
    data_reader = New_DataSplitter_leave_k_out(data_reader,
                                               k_out_value=3,
                                               use_validation_set=False,
                                               force_new_split=True,
                                               seed=get_split_seed())
    data_reader.load_data()
    URM_train, URM_test = data_reader.get_holdout_split()
    URM_all = data_reader.dataReader_object.get_URM_all()

    path = "../../report/hp_tuning/p3alpha/Nov23_14-29-55_k_out_value_3/"

    UCM_region = data_reader.dataReader_object.get_UCM_from_name('UCM_region')
    region_demographic = get_user_demographic(UCM_region, URM_all, 3)

    UCM_age = data_reader.dataReader_object.get_UCM_from_name('UCM_age')
    age_demographic = get_user_demographic(UCM_age, URM_all, 3)

    demographics = [region_demographic, age_demographic]
    demographics_names = ["region", "age"]

    basic_plots_from_tuning_results(
        path,
        P3alphaRecommender,
        URM_train,
        URM_test,
        save_on_file=True,
        demographic_list=demographics,
        demographic_list_name=demographics_names,
Ejemplo n.º 6
0
    def test_get_train_dataframe_proportion(self):
        """Check the dataframe built by ``get_train_dataframe_proportion``.

        The dataframe is built for the first 500 user indices with
        ``proportion=1`` and ``self.main_rec`` as the only recommender. The
        test then verifies, in order: ordering by ``user_id``, the overlap
        check against the main recommender's recommendations, the ``label``
        column, the min-max scaled recommender score column, the age and
        region columns, ``user_act`` and ``item_pop``.
        """
        n_users = 500
        user_id_array = np.arange(n_users)

        df = get_train_dataframe_proportion(
            user_id_array,
            self.cutoff,
            self.main_rec,
            self.path,
            mapper=self.data_reader.get_original_user_id_to_index_mapper(),
            recommender_list=[self.main_rec],
            URM_train=self.URM_train,
            proportion=1)

        # Column arrays are extracted once instead of calling .iloc[i] per row.
        users = df['user_id'].values
        items = df['item_id'].values

        # Test that the df is ordered by user_id.
        # Adjacent elements are compared directly: handing a generator to
        # np.all() never evaluates its items, so such a check always passes.
        assert np.all(users[:-1] <= users[1:])

        # Test get_boosting_base_dataframe
        _, user_indptr = np.unique(users, return_index=True)
        user_indptr = np.concatenate([user_indptr, [users.size]])
        true_recommendations = np.array(
            self.main_rec.recommend(user_id_array=user_id_array,
                                    cutoff=self.cutoff,
                                    remove_seen_flag=True))
        user_recommendations_items = true_recommendations.reshape(
            (true_recommendations.size, 1)).squeeze()

        overlap_found = False
        for i, user in enumerate(user_id_array):
            # NOTE(review): start and stop are both user_indptr[user], so this
            # slice is always empty and the overlap check below cannot fire.
            # Presumably the stop bound was meant to be user_indptr[user + 1];
            # confirm the intended relation between the dataframe items and
            # the recommendations before tightening this check.
            df_items = items[user_indptr[user]:user_indptr[user]]
            first = i * self.cutoff
            true_items = user_recommendations_items[first:first + self.cutoff]
            if np.any(np.in1d(df_items, true_items, assume_unique=True)):
                overlap_found = True
                break
        assert not overlap_found

        # Test labels value
        labels = np.array(self.URM_train[users, items].tolist()).flatten()
        assert np.array_equal(labels, df['label'].values)

        # Test recommender predictions: the score column must equal the
        # recommender's item scores after global min-max scaling.
        all_scores = self.main_rec._compute_item_score(user_id_array)
        scaler = MinMaxScaler()
        scaler.fit(all_scores.reshape(-1, 1))
        # The target shape is passed positionally; the `newshape` keyword of
        # np.reshape is deprecated in recent NumPy releases.
        all_scores = np.reshape(scaler.transform(all_scores.reshape(-1, 1)),
                                all_scores.shape)
        assert np.array_equal(df[self.main_rec.RECOMMENDER_NAME].values,
                              all_scores[users, items])

        # Test advanced subclass

        # Test ICM information

        # Test UCM information
        UCM_age = self.data_reader.get_UCM_from_name("UCM_age")
        age_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
            "UCM_age")
        age_demographic = get_user_demographic(UCM_age, age_mapper)

        UCM_region = self.data_reader.get_UCM_from_name("UCM_region")
        region_mapper = self.data_reader.dataReader_object.get_UCM_feature_to_index_mapper_from_name(
            "UCM_region")
        id_to_original_region_mapper = {
            v: int(k)
            for k, v in region_mapper.items()
        }
        # One array per "region_<value>" column, looked up once up front.
        region_columns = {
            region: df["region_{}".format(region)].values
            for region in id_to_original_region_mapper.values()
        }
        ages = df['age'].values
        age_imputed_flags = df['age_imputed_flag'].values

        for i, (user, age, age_imputed_flag) in enumerate(
                zip(users, ages, age_imputed_flags)):
            # Test age: -1 in the demographic marks an imputed age.
            if age_demographic[user] == -1:
                assert age_imputed_flag == 1
                assert age == 5  # Imputed value (mode + 1)
            else:
                assert age_imputed_flag == 0
                assert age == age_demographic[user]

            # Test region: a region column is 1 exactly when the user's UCM
            # row contains that region.
            true_regions = {
                id_to_original_region_mapper[true_region]
                for true_region in UCM_region.indices[
                    UCM_region.indptr[user]:UCM_region.indptr[user + 1]]
            }
            for region, column in region_columns.items():
                expected = 1 if region in true_regions else 0
                assert column[i] == expected, "User {} has not correct region {}".format(
                    user, region)

        # Test user_activity: number of stored entries in the user's URM row.
        user_profile_lengths = np.diff(self.URM_train.indptr)
        assert np.array_equal(df['user_act'].values,
                              user_profile_lengths[users])

        # Test item_popularity: number of stored entries in the item's column.
        URM_train_csc = self.URM_train.tocsc()
        item_popularity = np.diff(URM_train_csc.indptr)
        assert np.array_equal(df['item_pop'].values, item_popularity[items])