def __init__(self):
    self.HYP = {}
    self.report_counter = 60
    self.writer = Writer()

    # Some parameters
    self.hyperparams = dict()
    self.hyperparams_names = list()
    self.hyperparams_values = list()
    self.hyperparams_single_value = dict()

    # Extractor for matrices
    extractor = Extractor()
    urm = extractor.get_urm_all()
    self.icm = extractor.get_icm_all()

    # Splitting into post-validation & testing in case of parameter tuning
    matrices = loo.split_train_leave_k_out_user_wise(urm, 1, False, True)
    self.urm_post_validation = matrices[0]
    self.urm_test = matrices[1]

    # Splitting the post-validation matrix into train & validation
    # (Merging train and validation again at the end is a problem, hence leave-one-out twice)
    matrices_for_validation = loo.split_train_leave_k_out_user_wise(self.urm_post_validation, 1, False, True)
    self.urm_train = matrices_for_validation[0]
    self.urm_validation = matrices_for_validation[1]
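# The nested leave-one-out split above is repeated in several of the runners below. A minimal
# sketch of a reusable helper, assuming only the positional arguments already used in this file
# for loo.split_train_leave_k_out_user_wise (urm, 1, False, True); the helper name is hypothetical.
def split_train_validation_test(urm, k=1):
    # First hold-out: k interactions per user go to the test set
    matrices = loo.split_train_leave_k_out_user_wise(urm, k, False, True)
    urm_post_validation, urm_test = matrices[0], matrices[1]

    # Second hold-out on the remainder: k interactions per user go to the validation set
    matrices_for_validation = loo.split_train_leave_k_out_user_wise(urm_post_validation, k, False, True)
    urm_train, urm_validation = matrices_for_validation[0], matrices_for_validation[1]

    return urm_train, urm_validation, urm_test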
def __init__(self, cutoff, cbfknn=False, icfknn=False, ucfknn=False, slim_bpr=False,
             pure_svd=False, als=False, cfw=False, p3a=False, rp3b=False, slim_en=False):
    """
    Initialization of the generic runner in which we decide whether or not to use an algorithm.
    """
    self.cutoff = cutoff
    self.cbfknn = cbfknn
    self.icfknn = icfknn
    self.ucfknn = ucfknn
    self.slim_bpr = slim_bpr
    self.pure_svd = pure_svd
    self.als = als
    self.cfw = cfw
    self.p3a = p3a
    self.rp3b = rp3b
    self.slim_en = slim_en

    self.writer = Writer
    self.extractor = Extractor()
    self.df_builder = XGBoostDataframe(self.cutoff)

    self.result_dict = None
    self.urm_train = None
    self.urm_validation = None
    self.icm = self.extractor.get_icm_all()

    self.p_cbfknn = None
    self.p_icfknn = None
    self.p_ucfknn = None
    self.p_slimbpr = None
    self.p_puresvd = None
    self.p_als = None
    self.p_cfw = None
    self.p_p3a = None
    self.p_rp3b = None
    self.p_slimen = None

    self.target_users = []
    self.results = []
    self.df_user_id_col = []
    self.df_item_id_col = []

    # Placeholders until the real dataframes are built
    self.df_train = pd.DataFrame()
    self.df_test = pd.DataFrame()
def run(self, is_test, is_SSLIM):
    """
    From here we start each algorithm.
    :param is_test: specifies if we want to write a report or a submission
    :param is_SSLIM: specifies if the ICM has to be re-weighted with SLIM-BPR before evaluation
    """
    self.is_test = is_test
    self.is_SSLIM = is_SSLIM

    if self.is_test:
        extractor = Extractor()
        urm = extractor.get_urm_all()
        self.icm = extractor.get_icm_all()

        # Splitting into post-validation & testing in case of parameter tuning
        matrices = loo.split_train_leave_k_out_user_wise(urm, 1, False, True)
        self.urm_post_validation = matrices[0]
        self.urm_test = matrices[1]

        # Splitting the post-validation matrix into train & validation
        # (Merging train and validation again at the end is a problem, hence leave-one-out twice)
        matrices_for_validation = loo.split_train_leave_k_out_user_wise(self.urm_post_validation, 1, False, True)
        self.urm_train = matrices_for_validation[0]
        self.urm_validation = matrices_for_validation[1]

        self.urm_train = extractor.preprocess_csr_matrix(self.urm_train)

        self.write_report()

        if self.is_SSLIM:
            # for topK in [50, 100, 200]:
            #     for epochs in [10, 20, 50, 100, 200, 300]:
            self.sslim_pars = WeightConstants.SLIM_BPR_ICM

            slim_bpr = SLIM_BPR_Cython(self.icm.copy())
            slim_bpr.fit(**self.sslim_pars)
            self.icm = slim_bpr.recs.copy().tocsr()

            self.evaluate()
        else:
            self.evaluate()
    else:
        extractor = Extractor()
        users = extractor.get_target_users_of_recs()
        self.urm_train = extractor.get_urm_all()
        self.icm = extractor.get_icm_all()

        self.write_submission(users)
def __init__(self, dataframe, group_length):
    self.builder = Builder()
    extractor = Extractor()

    self.dataframe = dataframe
    self.group_length = group_length

    self.urm = extractor.get_urm_all()
    self.icm = extractor.get_icm_all()
    self.users = extractor.get_target_users_of_recs()

    self.df_user_id_col = list(self.dataframe.loc[:, 'user_id'])
    self.df_item_id_col = list(self.dataframe.loc[:, 'item_id'])

    # Conversion from a list of strings to a list of ints
    self.df_user_id_col = [int(i) for i in self.df_user_id_col]
    self.df_item_id_col = [int(i) for i in self.df_item_id_col]
def run(self, is_test):
    self.is_test = is_test

    if self.is_test:
        extractor = Extractor()
        builder = Builder()
        urm = extractor.get_urm_all()
        self.icm = extractor.get_icm_all()

        # Splitting into post-validation & testing in case of parameter tuning
        matrices = loo.split_train_leave_k_out_user_wise(urm, 1, False, True)
        self.urm_post_validation = matrices[0]
        self.urm_test = matrices[1]

        # Splitting the post-validation matrix into train & validation
        # (Merging train and validation again at the end is a problem, hence leave-one-out twice)
        matrices_for_validation = loo.split_train_leave_k_out_user_wise(self.urm_post_validation, 1, False, True)
        self.urm_train = matrices_for_validation[0]
        self.urm_validation = matrices_for_validation[1]

        # Building the urm_per_feature lists
        if self.users_per_region:
            self.urm_per_region_list = builder.build_per_region_urm_train(self.urm_train)
        if self.users_per_age:
            self.urm_per_age_list = builder.build_per_age_urm_train(self.urm_train)

        self.write_report()
        self.evaluate()
    else:
        extractor = Extractor()
        builder = Builder()
        users = extractor.get_target_users_of_recs()
        self.urm_train = extractor.get_urm_all()
        self.icm = extractor.get_icm_all()

        # Building the urm_per_feature lists
        if self.users_per_region:
            self.urm_per_region_list = builder.build_per_region_urm_train(self.urm_train)
        if self.users_per_age:
            self.urm_per_age_list = builder.build_per_age_urm_train(self.urm_train)

        self.write_submission(users)
def __init__(self, urm_train, icm, urm_per_region_list, urm_per_age_list, weights,
             add_pure_svd=False, add_slim_bpr=False):
    self.urm_train = urm_train
    self.urm_per_region_list = urm_per_region_list
    self.urm_per_age_list = urm_per_age_list
    self.icm = icm
    self.add_pure_svd = add_pure_svd
    self.add_slim_bpr = add_slim_bpr
    self.weights = weights

    self.icfknn_list = []
    self.ucfknn_list = []
    self.icbfknn_list = []

    # BM25 re-weighting of the ICM for the content-based recommenders
    self.icm_bm25 = self.icm.copy().astype(np.float32)
    self.icm_bm25 = okapi_BM_25(self.icm_bm25)
    self.icm_bm25 = self.icm_bm25.tocsr()

    self.ratings = None
    self.extractor = Extractor()

    # Creation of the list of algorithms that have to be used
    if self.urm_per_region_list is not None:
        for urm in self.urm_per_region_list:
            urm = sps.csr_matrix(urm)  # ensure CSR format
            self.icfknn_list.append(ItemCFKNNRecommender(urm.copy()))
            self.ucfknn_list.append(UserCFKNNRecommender(urm.copy()))
            self.icbfknn_list.append(ItemCBFKNNRecommender(urm.copy(), self.icm_bm25.copy()))

    if self.urm_per_age_list is not None:
        for urm in self.urm_per_age_list:
            urm = sps.csr_matrix(urm)  # ensure CSR format
            self.icfknn_list.append(ItemCFKNNRecommender(urm.copy()))
            self.ucfknn_list.append(UserCFKNNRecommender(urm.copy()))
            self.icbfknn_list.append(ItemCBFKNNRecommender(urm.copy(), self.icm_bm25.copy()))

    # self.icfknn_list.append(ItemCFKNNRecommender(self.urm_train.copy()))
    # self.ucfknn_list.append(UserCFKNNRecommender(self.urm_train.copy()))
    # self.icbfknn_list.append(ItemCBFKNNRecommender(self.urm_train.copy(), self.icm_bm25.copy()))

    if self.add_pure_svd:
        self.pure_SVD = PureSVDRecommender(self.urm_train.copy())

    if self.add_slim_bpr:
        self.slim_bpr = SLIM_BPR_Cython(self.urm_train.copy())
def run(self, is_test):
    """
    From here we start each algorithm.
    :param is_test: specifies if we want to write a report or a submission
    """
    self.is_test = is_test

    if self.is_test:
        extractor = Extractor()
        urm = extractor.get_urm_all()
        self.icm = extractor.get_icm_all()
        # self.icm_dirty = extractor.get_icm_price_dirty()

        # Splitting into post-validation & testing in case of parameter tuning
        matrices = loo.split_train_leave_k_out_user_wise(urm, 1, False, True)
        self.urm_post_validation = matrices[0]
        self.urm_test = matrices[1]

        # ONLY TRAIN AND TEST
        self.urm_train = self.urm_post_validation

        # Splitting the post-validation matrix into train & validation
        # (Merging train and validation again at the end is a problem, hence leave-one-out twice)
        # matrices_for_validation = loo.split_train_leave_k_out_user_wise(self.urm_post_validation, 1, False, True)
        # self.urm_train = matrices_for_validation[0]
        # self.urm_validation = matrices_for_validation[1]

        self.evaluate()
    else:
        extractor = Extractor()
        users = extractor.get_target_users_of_recs()
        self.urm_train = extractor.get_urm_all()
        # self.icm = extractor.get_icm_all()

        self.write_submission(users)
def data_visualization():
    # Retrieving variables
    userList = list(Extractor().get_users(True))
    itemList = list(Extractor().get_tracks(True, True))

    userList_unique = list(set(userList))
    itemList_unique = list(set(itemList))

    numUsers = len(userList_unique)
    numItems = len(itemList_unique)
    numberInteractions = Extractor().get_numb_interactions()

    print("Number of items\t {}, Number of users\t {}".format(numItems, numUsers))
    print("Max ID items\t {}, Max Id users\t {}\n".format(max(itemList_unique), max(userList_unique)))
    print("Average interactions per user {:.2f}".format(numberInteractions / numUsers))
    print("Average interactions per item {:.2f}\n".format(numberInteractions / numItems))
    print("Sparsity {:.2f} %".format((1 - float(numberInteractions) / (numItems * numUsers)) * 100))

    URM_all = Extractor().get_train(True)
    URM_all = URM_all.tocsr()

    # Item popularity: number of interactions per item
    itemPopularity = (URM_all > 0).sum(axis=0)
    itemPopularity = np.array(itemPopularity).squeeze()

    pyplot.plot(itemPopularity, 'ro')
    pyplot.ylabel('Num Interactions')
    pyplot.xlabel('Item Index')
    pyplot.show()

    itemPopularity = np.sort(itemPopularity)

    pyplot.plot(itemPopularity, 'ro')
    pyplot.ylabel('Num Interactions')
    pyplot.xlabel('Item Index')
    pyplot.show()

    # User activity: number of interactions per user
    userActivity = (URM_all > 0).sum(axis=1)
    userActivity = np.array(userActivity).squeeze()
    userActivity = np.sort(userActivity)

    pyplot.plot(userActivity, 'ro')
    pyplot.ylabel('Num Interactions')
    pyplot.xlabel('User Index')
    pyplot.show()
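# The sparsity printed above can also be read directly from the sparse matrix itself; a minimal
# self-contained sketch (numpy/scipy only, independent of the Extractor):
import scipy.sparse as sps

def urm_sparsity(urm: sps.spmatrix) -> float:
    # percentage of empty cells in the user-item matrix
    n_users, n_items = urm.shape
    return (1.0 - urm.nnz / (n_users * n_items)) * 100.0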
def __init__(self, cbfknn=True, icfknn=True, ucfknn=True, slim_bpr=True, pure_svd=True,
             als=True, cfw=True, p3a=True, rp3b=True, slim_en=True):
    """
    Initialization of the generic runner in which we decide whether or not to use an algorithm.
    """
    self.cbfknn = cbfknn
    self.icfknn = icfknn
    self.ucfknn = ucfknn
    self.slim_bpr = slim_bpr
    self.pure_svd = pure_svd
    self.als = als
    self.cfw = cfw
    self.p3a = p3a
    self.rp3b = rp3b
    self.slim_en = slim_en

    self.is_test = None
    self.writer = Writer
    self.extractor = Extractor()

    self.result_dict = None
    self.urm_train = None
    self.urm_validation = None
    self.icm = None

    self.p_cbfknn = None
    self.p_icfknn = None
    self.p_ucfknn = None
    self.p_slimbpr = None
    self.p_puresvd = None
    self.p_als = None
    self.p_cfw = None
    self.p_p3a = None
    self.p_rp3b = None
    self.p_slimen = None

    self.target_users = []
    self.results = []

    # Default parameters for each enabled algorithm
    if self.cbfknn:
        self.p_cbfknn = WeightConstants.CBFKNN
    if self.icfknn:
        self.p_icfknn = WeightConstants.ICFKNN
    if self.ucfknn:
        self.p_ucfknn = WeightConstants.UCFKNN
    if self.slim_bpr:
        self.p_slimbpr = WeightConstants.SLIM_BPR
    if self.pure_svd:
        self.p_puresvd = WeightConstants.PURE_SVD
    if self.als:
        self.p_als = WeightConstants.ALS
    if self.cfw:
        self.p_cfw = WeightConstants.CFW
    if self.p3a:
        self.p_p3a = WeightConstants.P3A
    if self.rp3b:
        self.p_rp3b = WeightConstants.RP3B
    if self.slim_en:
        self.p_slimen = WeightConstants.SLIM_ELASTIC_NET

    self.MAPs = []
class CrossValidationRunner(object):

    def __init__(self, cbfknn=True, icfknn=True, ucfknn=True, slim_bpr=True, pure_svd=True,
                 als=True, cfw=True, p3a=True, rp3b=True, slim_en=True):
        """
        Initialization of the generic runner in which we decide whether or not to use an algorithm.
        """
        self.cbfknn = cbfknn
        self.icfknn = icfknn
        self.ucfknn = ucfknn
        self.slim_bpr = slim_bpr
        self.pure_svd = pure_svd
        self.als = als
        self.cfw = cfw
        self.p3a = p3a
        self.rp3b = rp3b
        self.slim_en = slim_en

        self.is_test = None
        self.writer = Writer
        self.extractor = Extractor()

        self.result_dict = None
        self.urm_train = None
        self.urm_validation = None
        self.icm = None

        self.p_cbfknn = None
        self.p_icfknn = None
        self.p_ucfknn = None
        self.p_slimbpr = None
        self.p_puresvd = None
        self.p_als = None
        self.p_cfw = None
        self.p_p3a = None
        self.p_rp3b = None
        self.p_slimen = None

        self.target_users = []
        self.results = []

        # Default parameters for each enabled algorithm
        if self.cbfknn:
            self.p_cbfknn = WeightConstants.CBFKNN
        if self.icfknn:
            self.p_icfknn = WeightConstants.ICFKNN
        if self.ucfknn:
            self.p_ucfknn = WeightConstants.UCFKNN
        if self.slim_bpr:
            self.p_slimbpr = WeightConstants.SLIM_BPR
        if self.pure_svd:
            self.p_puresvd = WeightConstants.PURE_SVD
        if self.als:
            self.p_als = WeightConstants.ALS
        if self.cfw:
            self.p_cfw = WeightConstants.CFW
        if self.p3a:
            self.p_p3a = WeightConstants.P3A
        if self.rp3b:
            self.p_rp3b = WeightConstants.RP3B
        if self.slim_en:
            self.p_slimen = WeightConstants.SLIM_ELASTIC_NET

        self.MAPs = []

    def run(self, is_test):
        """
        From here we start each algorithm.
        :param is_test: specifies if we want to write a report or a submission
        """
        self.is_test = is_test
        self.icm = self.extractor.get_icm_all()

        if self.is_test:
            # CREATION OF THE VALIDATIONS FOR EACH PART OF THE TRAIN
            vals = []
            urms = []
            target_profiles = []

            for i in range(1, 5):
                urm_to_predict = self.extractor.get_single_urm(i)
                matrices = loo.split_train_leave_k_out_user_wise(urm_to_predict, 1, False, True)

                target_users_profile = matrices[0]
                target_profiles.append(target_users_profile)

                val = matrices[1]
                vals.append(val)

                urm = self.extractor.get_others_urm_vstack(i)
                urms.append(urm)

            if self.icfknn:
                self.p_icfknn = ParametersTuning.ICFKNN_BEST
            if self.cbfknn:
                self.p_cbfknn = ParametersTuning.CBFKNN_BEST
            if self.rp3b:
                self.p_rp3b = ParametersTuning.RP3B_BEST
            if self.slim_en:
                self.p_slimen = ParametersTuning.SLIM_ELASTIC_NET_BEST

            # TUNING WITH THE DIFFERENT PARAMS
            for params in ParametersTuning.UCFKNN:
                if self.ucfknn:
                    self.p_ucfknn = params
                self.write_report()

                # URM split into 4 smaller URMs for cross-validation
                for i in range(0, 4):
                    self.urm_validation = vals[i].copy()
                    self.urm_train = urms[i].copy()

                    self.target_users = self.extractor.get_target_users_of_specific_part(i + 1)
                    self.evaluate(i + 1, target_profiles[i])

                self.output_average_MAP()

            self.output_best_params()
        else:
            self.p_cbfknn = ParametersTuning.CBFKNN_BEST
            self.p_icfknn = ParametersTuning.ICFKNN_BEST
            self.p_slimen = ParametersTuning.SLIM_ELASTIC_NET_BEST
            self.p_rp3b = ParametersTuning.RP3B_BEST

            users = self.extractor.get_target_users_of_recs()
            self.urm_train = self.extractor.get_urm_all()

            self.writer.write_header(self.writer, sub_counter=submission_counter)
            self.write_submission(users)

    def write_report(self):
        """
        This method is useful to write the report, selecting only the chosen algorithms.
        """
        now = datetime.now()
        date = datetime.fromtimestamp(datetime.timestamp(now))

        self.writer.write_report(self.writer, "--------------------------------------", report_counter)
        self.writer.write_report(self.writer, "--------------------------------------\n", report_counter)
        self.writer.write_report(self.writer, "REPORT " + str(date), report_counter)
        self.writer.write_report(self.writer, "----- Cross-validation procedure -----\n", report_counter)
        self.writer.write_report(self.writer, "Fixed parameters", report_counter)
        self.writer.write_report(self.writer, "--------------------------------------", report_counter)

        if self.cbfknn:
            self.writer.write_report(self.writer, "CBFKNN: " + str(self.p_cbfknn), report_counter)
        if self.icfknn:
            self.writer.write_report(self.writer, "ICFKNN: " + str(self.p_icfknn), report_counter)
        if self.ucfknn:
            self.writer.write_report(self.writer, "UCFKNN: " + str(self.p_ucfknn), report_counter)
        if self.slim_bpr:
            self.writer.write_report(self.writer, "SLIM_BPR: " + str(self.p_slimbpr), report_counter)
        if self.pure_svd:
            self.writer.write_report(self.writer, "PURE_SVD: " + str(self.p_puresvd), report_counter)
        if self.als:
            self.writer.write_report(self.writer, "ALS: " + str(self.p_als), report_counter)
        if self.cfw:
            self.writer.write_report(self.writer, "CFW: " + str(self.p_cfw), report_counter)
        if self.p3a:
            self.writer.write_report(self.writer, "P3A: " + str(self.p_p3a), report_counter)
        if self.rp3b:
            self.writer.write_report(self.writer, "RP3B: " + str(self.p_rp3b), report_counter)
        if self.slim_en:
            self.writer.write_report(self.writer, "SLIM_ELASTIC_NET: " + str(self.p_slimen), report_counter)

        self.writer.write_report(self.writer, "--------------------------------------\n", report_counter)

    def write_submission(self, users):
        """
        This method is used to write the submission, selecting only the chosen algorithms.
        :return:
        """
        recommender = WeightedHybrid(self.urm_train, self.icm, self.p_icfknn, self.p_ucfknn, self.p_cbfknn,
                                     self.p_slimbpr, self.p_puresvd, self.p_als, self.p_cfw, self.p_p3a,
                                     self.p_rp3b, self.p_slimen, WeightConstants.NO_WEIGHTS[0],
                                     seen_items=self.urm_train)
        recommender.fit()

        from tqdm import tqdm
        for user_id in tqdm(users):
            recs = recommender.recommend(user_id, at=20)
            self.writer.write(self.writer, user_id, recs, sub_counter=submission_counter)

        print("Submission file written")

    def evaluate(self, index: int, target_users_profile):
        """
        Method used for the validation and the calculation of the weights.
        """
        generated_weights = []

        self.writer.write_report(self.writer, "VALIDATION " + str(index), report_counter)

        for weight in self.get_test_weights(add_random=False):
            generated_weights.append(weight)
            print("--------------------------------------")

            recommender = WeightedHybrid(self.urm_train, self.icm, self.p_icfknn, self.p_ucfknn, self.p_cbfknn,
                                         self.p_slimbpr, self.p_puresvd, self.p_als, self.p_cfw, self.p_p3a,
                                         self.p_rp3b, self.p_slimen, weight, seen_items=target_users_profile)
            recommender.fit()

            result_dict = evaluate_algorithm_crossvalidation(self.urm_validation, recommender, self.target_users)
            self.results.append(float(result_dict["MAP"]))

            del recommender

            # self.writer.write_report(self.writer, str(weight), report_counter)
            self.writer.write_report(self.writer, str(result_dict), report_counter)

        self.writer.write_report(self.writer, "--------------------------------------", report_counter)

        # Retrieving the correct weight
        # results.sort()
        # weight = generated_weights[int(self.results.index(max(self.results)))]

    def get_test_weights(self, add_random=False):
        if not add_random:
            return WeightConstants.NO_WEIGHTS
        else:
            new_weights = []
            for weight in WeightConstants.IS_TEST_WEIGHTS:
                new_weights.append(weight)

                for i in range(0, 5):
                    new_obj = weight.copy()
                    new_obj["icfknn"] += round(random.uniform(-min(0.5, weight["icfknn"]), 0.5), 2)
                    new_obj["ucfknn"] += round(random.uniform(-min(0.5, weight["ucfknn"]), 0.5), 2)
                    new_obj["cbfknn"] += round(random.uniform(-min(0.5, weight["cbfknn"]), 0.5), 2)
                    new_obj["slimbpr"] += round(random.uniform(-min(0.5, weight["slimbpr"]), 0.5), 2)
                    new_obj["puresvd"] += round(random.uniform(-min(0.5, weight["puresvd"]), 0.5), 2)
                    new_obj["als"] += round(random.uniform(-min(0.5, weight["als"]), 0.5), 2)
                    new_obj["cfw"] += round(random.uniform(-min(0.5, weight["cfw"]), 0.5), 2)
                    new_obj["p3a"] += round(random.uniform(-min(0.5, weight["p3a"]), 0.5), 2)
                    new_obj["rp3b"] += round(random.uniform(-min(0.5, weight["rp3b"]), 0.5), 2)
                    new_obj["slimen"] += round(random.uniform(-min(0.5, weight["slimen"]), 0.5), 2)
                    new_weights.append(new_obj)

            return new_weights

    def output_average_MAP(self):
        average_MAP = 0
        for res in self.results:
            average_MAP += res
        average_MAP /= len(self.results)

        self.MAPs.append(average_MAP)
        self.results.clear()

        self.writer.write_report(self.writer, "--------------------------------------", report_counter)
        self.writer.write_report(self.writer, "The average MAP is: " + str(average_MAP), report_counter)

    def output_best_params(self):
        best_MAP = max(self.MAPs)
        index = self.MAPs.index(best_MAP)
        best_params = ParametersTuning.UCFKNN[index]
        # best_params = ParametersTuning.RP3B[index]

        self.writer.write_report(self.writer, "--------------------------------------", report_counter)
        self.writer.write_report(self.writer,
                                 "With a MAP of " + str(best_MAP) + " the best parameters are: " + str(best_params),
                                 report_counter)
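# The per-key perturbation inside get_test_weights repeats the same expression for every
# algorithm key. A more compact sketch of the same idea; the helper name is hypothetical and it
# relies on the `random` module already used by the surrounding code.
def perturb_weights(weight, spread=0.5, decimals=2):
    new_obj = weight.copy()
    for key in new_obj:
        # shift each weight by a random amount in [-min(spread, weight[key]), spread]
        new_obj[key] += round(random.uniform(-min(spread, weight[key]), spread), decimals)
    return new_obj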
class XGBoost(object):

    def __init__(self, cutoff, cbfknn=False, icfknn=False, ucfknn=False, slim_bpr=False,
                 pure_svd=False, als=False, cfw=False, p3a=False, rp3b=False, slim_en=False):
        """
        Initialization of the generic runner in which we decide whether or not to use an algorithm.
        """
        self.cutoff = cutoff
        self.cbfknn = cbfknn
        self.icfknn = icfknn
        self.ucfknn = ucfknn
        self.slim_bpr = slim_bpr
        self.pure_svd = pure_svd
        self.als = als
        self.cfw = cfw
        self.p3a = p3a
        self.rp3b = rp3b
        self.slim_en = slim_en

        self.writer = Writer
        self.extractor = Extractor()
        self.df_builder = XGBoostDataframe(self.cutoff)

        self.result_dict = None
        self.urm_train = None
        self.urm_validation = None
        self.icm = self.extractor.get_icm_all()

        self.p_cbfknn = None
        self.p_icfknn = None
        self.p_ucfknn = None
        self.p_slimbpr = None
        self.p_puresvd = None
        self.p_als = None
        self.p_cfw = None
        self.p_p3a = None
        self.p_rp3b = None
        self.p_slimen = None

        self.target_users = []
        self.results = []
        self.df_user_id_col = []
        self.df_item_id_col = []

        # Placeholders until the real dataframes are built
        self.df_train = pd.DataFrame()
        self.df_test = pd.DataFrame()

    def run(self, is_test):
        """
        From here we start each algorithm.
        :param is_test: specifies if we want to write a report or a submission
        """
        if is_test:
            # CREATION OF THE VALIDATIONS FOR EACH PART OF THE TRAIN
            vals = []
            urms = []
            target_profiles = []

            for i in range(1, 5):
                urm_to_predict = self.extractor.get_single_urm(i)
                matrices = loo.split_train_leave_k_out_user_wise(urm_to_predict, 1, False, True)

                target_users_profile = matrices[0]
                target_profiles.append(target_users_profile)

                val = matrices[1]
                vals.append(val)

                urm = self.extractor.get_others_urm_vstack(i)
                urms.append(urm)

            if self.icfknn:
                self.p_icfknn = ParametersTuning.ICFKNN_BEST
            if self.cbfknn:
                self.p_cbfknn = ParametersTuning.CBFKNN_BEST
            if self.rp3b:
                self.p_rp3b = ParametersTuning.RP3B_BEST
            if self.slim_en:
                self.p_slimen = ParametersTuning.SLIM_ELASTIC_NET_BEST

            # URM split into 4 smaller URMs for cross-validation
            for i in range(0, 4):
                self.urm_validation = vals[i].copy()
                self.urm_train = urms[i].copy()

                self.target_users = self.extractor.get_target_users_of_specific_part(i + 1)

                # GETTING THE RECOMMENDATIONS FOR THE TRAIN DATAFRAME
                user_ids, item_ids = self.evaluate(i + 1, target_profiles[i])
                self.df_user_id_col.extend(user_ids)
                self.df_item_id_col.extend(item_ids)

            # print(self.df_user_id_col[0:100])
            # print(self.df_item_id_col[0:100])
            # print(len(self.df_user_id_col))
            # print(len(self.df_item_id_col))

            self.score_ranking()

    def write_submission(self, users):
        pass

    # RE-RANKING AND EVALUATION
    def evaluate(self, index: int, target_users_profile):
        """
        This method captures the predictions of the cross-validation by running the Hybrid.
        :param index: number of the iteration (from 1 to 4), depending on the current sub-URM
        :param target_users_profile: profiles of the users we want to predict
        :return: the user ids and item ids of the generated recommendations
        """
        weight = {'icfknn': 1, 'ucfknn': 1, 'cbfknn': 1, 'slimbpr': 1, 'puresvd': 1,
                  'als': 1, 'cfw': 1, 'p3a': 1, 'rp3b': 1, 'slimen': 1}

        recommender = WeightedHybrid(self.urm_train, self.icm, self.p_icfknn, self.p_ucfknn, self.p_cbfknn,
                                     self.p_slimbpr, self.p_puresvd, self.p_als, self.p_cfw, self.p_p3a,
                                     self.p_rp3b, self.p_slimen, weight, seen_items=target_users_profile)
        recommender.fit()

        user_ids = []
        item_ids = []

        # SELECTING THE BEST RECOMMENDATIONS
        for n_user in self.target_users:
            recommendations = recommender.recommend(n_user, at=self.cutoff)
            user_ids.extend([n_user] * len(recommendations))
            item_ids.extend(recommendations)

        return user_ids, item_ids

    # RE-RANKING OF THE SCORES WITH XGBOOST
    def score_ranking(self):
        """
        Preparation of the dataframes and use of XGBoost.
        :return:
        """
        # CREATING DATAFRAMES FOR XGBOOST
        print(">>> Preparing the two DataFrames...")

        # Train dataframe
        self.df_train = self.df_builder.build_base_dataframe(users=self.df_user_id_col, items=self.df_item_id_col)
        self.df_builder.build_whole_dataframe(self.df_train)

        # Test dataframe
        self.df_test = self.df_builder.retrieve_test_dataframe()

        # BUILD TRAIN AND TEST GROUPS
        train_group = []
        test_group = []

        train_user_ids = list(self.df_train.loc[:, 'user_id'].values)
        test_user_ids = list(self.df_test.loc[:, 'user_id'].values)

        # One group of `cutoff` rows per distinct user
        train_group.extend([self.cutoff] * len(set(train_user_ids)))
        test_group.extend([self.cutoff] * len(set(test_user_ids)))

        # DROP USELESS COLUMNS OF DF_TRAIN AND DF_TEST
        train_dropped = self.df_train.drop(labels={'user_id', 'item_id'}, axis=1)
        test_dropped = self.df_test.drop(labels={'user_id', 'item_id'}, axis=1)

        print(">>> DataFrames well formed and ready to be used!")

        # LGBM TO TRAIN FASTER ON GPU
        # lgbm_group = self.xgb_dataframe.groupby('user_id').size().values
        # lightGBM_ranker = lgb.LGBMRanker(device='gpu')
        # lightGBM_ranker.fit(train_dropped, users, lgbm_group)

        # XGB RANKER AT WORK
        print(">>> Fitting of the XGB model...")
        xgb_ranker = xgb.XGBRanker()
        xgb_ranker.fit(train_dropped, train_user_ids, train_group)
        print(">>> Fitting completed!")

        # model = xgb.train(params,
        #                   dtrain,
        #                   num_round,
        #                   verbose_eval=2,
        #                   early_stopping_rounds=20)
        # print(xgb_regressor.predict(X_test))

        print(">>> Predicting the scores...")
        predictions = xgb_ranker.predict(test_dropped)
        print(">>> DONE")
        print(predictions[0:100])

        # print("user array:" + str(len(self.user_recommendations_user_id)))
        # print(" prediction array:" + str(len(model.predict())))

    def add_top_pop_items(self):
        # BUILDING POPULARITY ITEMS
        print("Adding TopPop items feature...")

        topPop = TopPop(self.urm_train)
        topPop.fit()

        topPop_score_list = []
        for user_id, item_id in zip(self.user_recommendations_user_id, self.user_recommendations_items):
            topPop_score = topPop._compute_item_score([user_id])[0, item_id]
            topPop_score_list.append(topPop_score)

        self.xgb_dataframe['item_popularity'] = pd.Series(topPop_score_list, index=self.xgb_dataframe.index)
        print("Addition completed!")

    def add_user_profile_length(self):
        # BUILDING USER PROFILE LENGTH
        print("Adding user profile length feature...")

        user_profile_len = np.ediff1d(self.urm_train.indptr)

        user_profile_len_list = []
        for user_id in self.user_recommendations_user_id:
            user_profile_len_list.append(user_profile_len[user_id])

        self.xgb_dataframe['user_profile_len'] = pd.Series(user_profile_len_list, index=self.xgb_dataframe.index)
        print("Addition completed!")

    def add_item_asset(self):
        # BUILDING ITEM ASSET
        print("Adding item asset feature...")

        icm_asset_df = self.builder.build_icm_asset_dataframe()

        assets = []
        j = 0
        for i in range(0, self.icm.shape[0]):
            if icm_asset_df.iloc[j]['row'] == i:
                assets.append(icm_asset_df.iloc[j]['data'])
                j += 1
            else:
                assets.append(0)

        icm_asset_list = []
        for item_id in self.user_recommendations_items:
            icm_asset_list.append(assets[item_id])

        self.xgb_dataframe['item_asset'] = pd.Series(icm_asset_list, index=self.xgb_dataframe.index)
        print("Addition completed!")

    def add_item_price(self):
        # BUILDING ITEM PRICE
        print("Adding item price feature...")

        icm_price_df = self.builder.build_icm_price_dataframe()

        prices = []
        j = 0
        for i in range(0, self.icm.shape[0]):
            if icm_price_df.iloc[j]['row'] == i:
                prices.append(icm_price_df.iloc[j]['data'])
                j += 1
            else:
                prices.append(0)

        icm_price_list = []
        for item_id in self.user_recommendations_items:
            icm_price_list.append(prices[item_id])

        self.xgb_dataframe['item_price'] = pd.Series(icm_price_list, index=self.xgb_dataframe.index)
        print("Addition completed!")

    def add_item_subclass(self):
        # BUILDING ITEM SUBCLASS
        print("Adding item subclass feature...")

        icm_subclass_df = self.builder.build_icm_subclass_dataframe()

        subclasses = []
        j = 0
        for i in range(0, self.icm.shape[0]):
            if icm_subclass_df.iloc[j]['row'] == i:
                subclasses.append(icm_subclass_df.iloc[j]['col'])
                j += 1
            else:
                subclasses.append(0)

        icm_subclass_list = []
        for item_id in self.user_recommendations_items:
            icm_subclass_list.append(subclasses[item_id])

        self.xgb_dataframe['item_subclass'] = pd.Series(icm_subclass_list, index=self.xgb_dataframe.index)
        print("Addition completed!")
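# XGBRanker's `group` argument lists, in row order, how many consecutive rows of the training
# matrix belong to each query (here: each user). score_ranking above assumes every user
# contributes exactly `cutoff` rows; a sketch that derives the group sizes from the dataframe
# itself instead, assuming rows of the same user are contiguous (as they are when the
# recommendations are appended user by user). The helper name is hypothetical.
import pandas as pd

def build_group_sizes(df: pd.DataFrame) -> list:
    # sizes of the consecutive blocks of identical user_id values, in order of appearance
    return df.groupby('user_id', sort=False).size().tolist()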
def __init__(self, group_length):
    self.group_length = group_length

    extractor = Extractor()
    self.users = extractor.get_target_users_of_recs()
def filter_seen(self, user_id, scores):
    # Items already seen by the user get a score of -inf so they cannot be recommended again
    start_pos = self.URM.indptr[user_id]
    end_pos = self.URM.indptr[user_id + 1]

    user_profile = self.URM.indices[start_pos:end_pos]
    scores[user_profile] = -np.inf

    return scores


if __name__ == '__main__':
    extractor = Extractor

    userList = extractor.get_interaction_users(extractor, False)
    itemList = extractor.get_interaction_items(extractor, False)
    ratingList = np.ones(Extractor().get_numb_interactions())

    URM_all = extractor.get_interaction_matrix(extractor, False)

    # Keep only warm items (items with at least one interaction)
    warm_items_mask = np.ediff1d(URM_all.tocsc().indptr) > 0
    warm_items = np.arange(URM_all.shape[1])[warm_items_mask]
    URM_all = URM_all[:, warm_items]

    # Keep only warm users (users with at least one interaction)
    warm_users_mask = np.ediff1d(URM_all.tocsr().indptr) > 0
    warm_users = np.arange(URM_all.shape[0])[warm_users_mask]
    URM_all = URM_all[warm_users, :]

    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.8)

    recommender = UserCFKNNRecommender(URM_train)
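# filter_seen above relies on CSR indexing: URM.indices[URM.indptr[u]:URM.indptr[u + 1]] lists
# exactly the items user u has interacted with. A minimal self-contained illustration on a toy URM:
import numpy as np
import scipy.sparse as sps

toy_urm = sps.csr_matrix(np.array([[1, 0, 1],
                                   [0, 1, 0]]))
user_id = 0
start_pos, end_pos = toy_urm.indptr[user_id], toy_urm.indptr[user_id + 1]
seen_items = toy_urm.indices[start_pos:end_pos]   # -> array([0, 2])

scores = np.array([0.9, 0.4, 0.7])
scores[seen_items] = -np.inf                      # seen items can no longer be recommended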
from OwnUtils.Extractor import Extractor
from TopPopRecommender import TopPopRecommender
from OwnUtils.Writer import Writer
import OwnUtils.Evaluator as ev
import numpy as np

if __name__ == '__main__':
    # Script to launch all the others

    SUBMISSION_NUMBER = 1  # TO BE CHANGED MANUALLY
    field_names = ['playlist_id', 'track_ids']

    users = Extractor.get_user_to_make_rec(Extractor)
    Writer.write_header(Writer, SUBMISSION_NUMBER, field_names)

    matrices = Extractor.get_train_test_matrix(Extractor)
    URM_train = matrices[0]
    URM_test = matrices[1]

    topPopRecommender_removeSeen = TopPopRecommender()
    topPopRecommender_removeSeen.fit(URM_train)

    unique_users = list(set(Extractor.get_interaction_users(Extractor, False)))
    ev.evaluate_algorithm(URM_test, topPopRecommender_removeSeen, unique_users, at=10)

    for user_id in users:
        recs = topPopRecommender_removeSeen.recommend(user_id, at=10)
        Writer.write(Writer, SUBMISSION_NUMBER, user_id, recs)
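# For reference, a minimal top-popularity recommender in the spirit of what the script above
# expects (fit on a URM, recommend the globally most popular unseen items). This is a sketch
# only, not necessarily the project's TopPopRecommender implementation.
import numpy as np

class SimpleTopPop:

    def fit(self, urm_train):
        self.urm_train = urm_train.tocsr()
        item_popularity = np.ediff1d(urm_train.tocsc().indptr)   # interactions per item
        self.popular_items = np.argsort(item_popularity)[::-1]   # most popular first

    def recommend(self, user_id, at=10):
        start, end = self.urm_train.indptr[user_id], self.urm_train.indptr[user_id + 1]
        seen = self.urm_train.indices[start:end]
        unseen = self.popular_items[np.isin(self.popular_items, seen, invert=True)]
        return unseen[:at]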