def simple_evaluation(self, file_result, file_test):
    """
    Compute RMSE and MAE between a prediction file and a test file.

    :param file_result: (file) prediction file to evaluate
    :param file_test: (file) test file
    :return: tuple (rmse, mae); both are 0 if no comparable pairs were found
    """
    predict = ReadFile(file_result, space_type=self.space_type).return_information()
    test = ReadFile(file_test, space_type=self.space_type).return_information()

    rmse = 0
    mae = 0
    count_comp = 0

    for user in test['users']:
        for item in test['feedback'][user]:
            try:
                rui_predict = float(predict['feedback'][user][item])
                rui_test = float(test['feedback'][user][item])
                rmse += math.pow((rui_predict - rui_test), 2)
                mae += math.fabs(rui_predict - rui_test)
                count_comp += 1
            except KeyError:
                # (user, item) pair missing from the prediction file: skip it
                pass

    if count_comp != 0:
        rmse = math.sqrt(float(rmse) / float(count_comp))
        # BUG FIX: MAE is the plain mean of absolute errors -- the original
        # code wrongly applied math.sqrt() to it as well.
        mae = float(mae) / float(count_comp)

    return rmse, mae
def __init__(self, train_file, test_file=None, ranking_file=None, similarity_metric="correlation",
             neighbors=30, rank_number=10, implicit=False, space_type='\t'):
    """
    User KNN recommender state.

    :param train_file: (file) training data
    :param test_file: (file) optional test data
    :param ranking_file: (file) optional output file for the ranking
    :param similarity_metric: (string) metric used to compute user similarity
    :param neighbors: (int) number of neighbors (k)
    :param rank_number: (int) size of the ranking per user
    :param implicit: (bool) treat the training feedback as implicit
    :param space_type: (string) column separator of the data files
    """
    self.train_set = ReadFile(train_file, space_type=space_type).return_information(implicit)
    self.test_file = test_file
    self.users = self.train_set['users']
    self.items = self.train_set['items']

    if self.test_file is not None:
        # BUG FIX: the test file must be parsed with the same separator as
        # the train file; the original dropped space_type here.
        self.test_set = ReadFile(test_file, space_type=space_type).return_information()
        self.users = sorted(list(self.train_set['users']) + list(self.test_set['users']))
        self.items = sorted(list(self.train_set['items']) + list(self.test_set['items']))

    self.k = neighbors
    self.similarity_metric = similarity_metric
    self.ranking_file = ranking_file
    self.rank_number = rank_number
    self.space_type = space_type
    self.matrix = self.train_set['matrix']
    self.ranking = list()
    self.si_matrix = None
def read_ranking_files(self):
    """Load every ranking file and record its [min, max] score bounds for normalization."""
    for rank_file in self.list_rank_files:
        reader = ReadFile(rank_file, space_type=self.space_type)
        interactions, scores = reader.read_rankings()
        self.rankings.append(interactions)
        self.normalization.append([min(scores), max(scores)])
def treat_interactions(self):
    """Read each training file, accumulate its triples, then build global lookup structures."""
    for train_file in self.list_train_files:
        reader = ReadFile(train_file, space_type=self.space_type)
        reader.triple_information()
        triples = reader.triple_dataset
        self.individual_datasets.append(triples)
        self.final_dataset += triples

    (self.dict_item, self.dict_not_item, self.list_users,
     self.list_items, self.dict_index) = return_list_info(self.final_dataset)
    self.list_users = list(self.list_users)
    self.list_items = list(self.list_items)
def __init__(self, train_file, test_file=None, ranking_file=None, rank_number=10, space_type='\t'):
    """
    Base item-recommendation state.

    :param train_file: (file) training data
    :param test_file: (file) optional test data
    :param ranking_file: (file) optional output file for the ranking
    :param rank_number: (int) size of the ranking per user
    :param space_type: (string) column separator of the data files
    """
    self.train_set = ReadFile(train_file, space_type=space_type).return_information()
    self.test_file = test_file
    self.users = self.train_set['users']
    self.items = self.train_set['items']

    if self.test_file is not None:
        # BUG FIX: propagate the column separator to the test-file reader;
        # the original dropped space_type here.
        self.test_set = ReadFile(test_file, space_type=space_type).return_information()
        self.users = sorted(list(self.train_set['users']) + list(self.test_set['users']))
        self.items = sorted(list(self.train_set['items']) + list(self.test_set['items']))

    self.ranking_file = ranking_file
    self.rank_number = rank_number
    self.space_type = space_type
    self.ranking = list()
def divide_dataset(self):
    """
    Build n_folds train/test splits of the dataset (repeated random holdout).

    For each fold the deduplicated triple dataset is reshuffled and cut at
    (1 - test_ratio); feedback values are then recovered per interaction
    source and the folds are optionally written to disk via WriteFile.
    """
    tp = ReadFile(self.dataset, space_type=self.space_type)
    tp.split_dataset()

    for fold in range(self.n_folds):
        dict_feedback = list()
        # deduplicate once, then reshuffle for every fold
        tp.triple_dataset = list(set(tp.triple_dataset))
        random.shuffle(tp.triple_dataset)
        # split point: first (1 - test_ratio) of the triples go to train
        sp = int((1 - self.test_ratio) * len(tp.triple_dataset))
        train = tp.triple_dataset[:sp]
        test = tp.triple_dataset[sp:]
        train.sort()
        # test is ordered by its first element (presumably the user id -- TODO confirm)
        test.sort(key=lambda x: x[0])
        train_set = list()
        test_set = list()

        # NOTE(review): this iterates self.dataset itself but only uses the
        # index i into tp.individual_interaction -- presumably self.dataset
        # is a list of interaction sources; confirm against the caller.
        for i, feedback in enumerate(self.dataset):
            dict_individual = dict()
            for triple in train:
                try:
                    dict_individual.setdefault(triple[0], {}).update({
                        triple[1]: tp.individual_interaction[i][triple[0]][triple[1]]
                    })
                    train_set.append([
                        triple[0], triple[1],
                        tp.individual_interaction[i][triple[0]][triple[1]]
                    ])
                except KeyError:
                    # pair absent from this interaction source: skip it
                    pass
            for triple_test in test:
                try:
                    test_set.append([
                        triple_test[0], triple_test[1],
                        tp.individual_interaction[i][triple_test[0]][
                            triple_test[1]]
                    ])
                except KeyError:
                    pass
            dict_feedback.append(dict_individual)

        self.dict_feedback_folds[fold] = dict_feedback
        self.dict_folds[fold] = {'train': train_set, 'test': test_set}

    if self.dir_folds is not None:
        WriteFile(self.dir_folds, self.dict_folds, self.space_type).split_dataset(
            self.dict_feedback_folds, self.dataset)
def divide_dataset(self):
    """
    Split the dataset into n_folds partitions for cross-fold validation.

    Each fold uses one partition as test set and the union of the remaining
    partitions as train set; folds are optionally written to disk.
    """
    self.tp = ReadFile(self.dataset, space_type=self.space_type).return_information()
    random.shuffle(self.tp['list_feedback'])

    # Number of interactions each partition should have.
    partition_size = int(float(self.tp['ni']) / float(self.n_folds))

    list_folds = list()
    last = 0
    for p in range(self.n_folds):
        # BUG FIX: the original started each slice at (1 + last), which
        # skipped one interaction at every partition boundary (Python slice
        # ends are already exclusive).
        final = (p + 1) * partition_size
        list_folds.append(self.tp['list_feedback'][last:final])
        last = final

    for fold in range(self.n_folds):
        train_set = list()
        for fold_train in range(self.n_folds):
            if fold_train != fold:
                train_set += list_folds[fold_train]
        train_set.sort()
        list_folds[fold].sort()
        self.dict_folds[fold] = {'train': train_set, 'test': list_folds[fold]}

    if self.dir_folds is not None:
        WriteFile(self.dir_folds, self.dict_folds, self.space_type).cross_fold_validation()
def execute(self, measures=('Prec@5', 'Prec@10', 'NDCG@5', 'NDCG@10', 'MAP@5', 'MAP@10')):
    """
    Run the Item Attribute KNN pipeline: print dataset stats, train
    (or load a precomputed similarity matrix), predict and evaluate.

    :param measures: tuple of ranking metrics forwarded to self.evaluate
    """
    print(
        "[Case Recommender: Item Recommendation > Item Attribute KNN Algorithm]\n"
    )
    print("training data:: ", len(self.train_set['users']), " users and ",
          len(self.train_set['items']), " items and ", self.train_set['ni'],
          " interactions | sparsity ", self.train_set['sparsity'])

    if self.test_file is not None:
        # BUG FIX: the original re-read the test file into a throwaway local
        # (then deleted it) while printing self.test_set's stats anyway;
        # the redundant disk read is removed.
        print("test data:: ", len(self.test_set['users']), " users and ",
              len(self.test_set['items']), " items and ", (self.test_set['ni']),
              " interactions | sparsity ", self.test_set['sparsity'])

    if self.similarity_matrix_file is not None:
        print("training time:: ", timed(self.read_matrix), " sec")
    else:
        print("training time:: ", timed(self.compute_similarity), " sec")
    print("prediction_time:: ", timed(self.predict), " sec\n")

    if self.test_file is not None:
        self.evaluate(measures)
def __init__(self, train_file, test_file=None, metadata_file=None, similarity_matrix_file=None,
             ranking_file=None, neighbors=30, rank_number=10, similarity_metric="correlation",
             space_type='\t'):
    """
    Item Attribute KNN recommender: a UserKNN variant whose similarity is
    computed over item metadata, or loaded from a precomputed matrix file.
    At least one of metadata_file / similarity_matrix_file is required.
    """
    UserKNN.__init__(self, train_file, test_file=test_file, ranking_file=ranking_file,
                     neighbors=neighbors, rank_number=rank_number,
                     similarity_metric=similarity_metric, space_type=space_type)

    if metadata_file is None and similarity_matrix_file is None:
        print("This algorithm needs a similarity matrix or a metadata file!")
        sys.exit(0)

    if metadata_file is not None:
        metadata_reader = ReadFile(metadata_file, space_type=space_type)
        self.metadata = metadata_reader.read_metadata(self.users)
        self.matrix = self.metadata['matrix']

    self.similarity_matrix_file = similarity_matrix_file
def __init__(self, train_file, test_file, metadata_file=None, similarity_matrix_file=None,
             prediction_file=None, neighbors=30, similarity_metric="correlation", space_type='\t'):
    """
    Item Attribute KNN rating predictor: an ItemKNN variant whose similarity
    comes from item metadata (transposed metadata matrix) or from a
    precomputed matrix file. At least one of the two sources is required.
    """
    ItemKNN.__init__(self, train_file, test_file, prediction_file=prediction_file,
                     neighbors=neighbors, similarity_metric=similarity_metric,
                     space_type=space_type)

    if metadata_file is None and similarity_matrix_file is None:
        print("This algorithm needs a similarity matrix or a metadata file!")
        sys.exit(0)

    if metadata_file is not None:
        metadata_reader = ReadFile(metadata_file, space_type=space_type)
        self.metadata = metadata_reader.read_metadata(self.items)
        # transpose so that rows correspond to items
        self.matrix = self.metadata['matrix'].T

    self.similarity_matrix_file = similarity_matrix_file
def __init__(self, train_file, test_file, prediction_file=None, similarity_metric="correlation",
             neighbors=30, space_type='\t'):
    """Rating-prediction KNN: load train/test sets and initialize KNN state."""
    self.train_set = ReadFile(train_file, space_type=space_type).return_information()
    self.test_set = ReadFile(test_file, space_type=space_type).return_information()
    BaseKNNRecommenders.__init__(self, self.train_set, self.test_set)

    self.k = neighbors
    self.similarity_metric = similarity_metric
    self.prediction_file = prediction_file
    self.predictions = []
    self.su_matrix = None
def simple_evaluation(self, file_result, file_test):
    """
    A simple evaluation method to return the quality of a ranking

    :param file_result: (file) ranking file to evaluate
    :param file_test: (file) test file
    :return: Values of evaluation
    """
    # Abort early on invalid files
    for data_file in (file_result, file_test):
        check_error_file(data_file)

    ranking_data = ReadFile(file_result, space_type=self.space_type).return_information()
    test_data = ReadFile(file_test, space_type=self.space_type).return_information()
    return self.default_evaluation(ranking_data, test_data)
def evaluation_ranking(self, ranking, test_file):
    """
    Evaluate an in-memory ranking (iterable of [user, item, ...] entries)
    against a test file, grouping the ranked items per user first.
    """
    test = ReadFile(test_file, space_type=self.space_type).return_information()

    du_order = {}
    for entry in ranking:
        du_order.setdefault(entry[0], []).append(entry[1])

    return self.default_evaluation({'du_order': du_order}, test)
def __init__(self, train_file, test_file, prediction_file=None, factors=10, init_mean=0.1,
             init_stdev=0.1, space_type='\t'):
    """
    Base state for NSVD1-style factorization models: loads train/test data,
    builds dense user/item index maps and factor placeholders.
    """
    self.train_set = ReadFile(train_file, space_type=space_type).return_information()
    self.test_set = ReadFile(test_file, space_type=space_type).return_information()
    self.prediction_file = prediction_file
    self.factors = factors
    self.init_mean = init_mean
    self.init_stdev = init_stdev

    # deterministic ordering over the union of train and test ids
    self.users = sorted(set(self.train_set["users"]) | set(self.test_set["users"]))
    self.items = sorted(set(self.train_set["items"]) | set(self.test_set["items"]))
    self.number_users = len(self.users)
    self.number_items = len(self.items)
    self.metadata = None
    self.number_metadata = None

    # id <-> dense-index maps
    self.map_items = {item: i for i, item in enumerate(self.items)}
    self.map_items_index = {i: item for i, item in enumerate(self.items)}
    self.map_users = {user: u for u, user in enumerate(self.users)}
    self.map_users_index = {u: user for u, user in enumerate(self.users)}

    # internal vars
    self.x = None
    self.p = None
    self.q = None
    self.w = None
    self.b = None
    self.c = None
    self.last_rmse = 0
    self.predictions = []
def treat_interactions(self):
    """
    Read every training file, merge their triples, and -- from the last file
    only -- count occurrences of the first two triple fields in dict_item_tag.
    """
    last_index = len(self.list_train_files) - 1
    for num, interaction_file in enumerate(self.list_train_files):
        reader = ReadFile(interaction_file, space_type=self.space_type)
        reader.triple_information()
        self.individual_datasets.append(reader.triple_dataset)
        self.final_dataset += reader.triple_dataset

        if num == last_index:
            for triple in reader.triple_dataset:
                for key in (triple[0], triple[1]):
                    self.dict_item_tag[key] = self.dict_item_tag.get(key, 0) + 1

    (self.dict_item, self.dict_not_item, self.list_users,
     self.list_items, self.dict_index) = return_list_info(self.final_dataset)
    self.list_users = list(self.list_users)
    self.list_items = list(self.list_items)
def __init__(self, list_ranks, test_file, write_file=""):
    """
    Combine several ranking files into one ensemble ranking and write it out.
    The whole pipeline (ensemble + write) runs immediately on construction.
    """
    self.list_ranks = list_ranks
    self.test_file = test_file
    self.write_file = write_file
    self.dict_ranks = ReadFile(self.list_ranks).ensemble()
    self.list_users = set()
    self.final_ranking = {}

    # methods
    self.ensemble()
    self.write_results()
def all_but_one_evaluation(self, file_result, file_test):
    """
    All-but-one Protocol: Considers only one pair (u, i) from the test set to evaluate the ranking

    :param file_result: (file) ranking file to evaluate
    :param file_test: (file) test file
    :return: Values of evaluation
    """
    # Abort early on invalid files
    for data_file in (file_result, file_test):
        check_error_file(data_file)

    ranking_data = ReadFile(file_result, space_type=self.space_type).return_information()
    test_data = ReadFile(file_test, space_type=self.space_type).return_information()

    # keep exactly one item per user in the test set
    for user in test_data['users']:
        first_item = list(test_data['du'][user])[0]
        test_data['du'][user] = [first_item]

    return self.default_evaluation(ranking_data, test_data)
def simple_evaluation_item_cold_start(self, file_result, file_test, file_train, min_feedback=10):
    """
    Compute RMSE and MAE restricted to cold-start items, i.e. items with at
    most min_feedback interactions in the training set.

    :param file_result: (file) prediction file to evaluate
    :param file_test: (file) test file
    :param file_train: (file) training file, used to detect cold-start items
    :param min_feedback: (int) maximum training interactions for an item to
           count as cold-start
    :return: tuple (rmse, mae); both are 0 if no comparable pairs were found
    """
    predict = ReadFile(file_result, space_type=self.space_type).return_information()
    test = ReadFile(file_test, space_type=self.space_type).return_information()
    train = ReadFile(file_train, space_type=self.space_type).return_information()

    # cold-start items: items with few training interactions
    new_items = set()
    for item in train['items']:
        if len(train['di'][item]) <= min_feedback:
            new_items.add(item)
    # (removed leftover debug print of len(new_items))

    rmse = 0
    mae = 0
    count_comp = 0

    for user in test['users']:
        for item in test['feedback'][user]:
            if item in new_items:
                try:
                    rui_predict = float(predict['feedback'][user][item])
                    rui_test = float(test['feedback'][user][item])
                    rmse += math.pow((rui_predict - rui_test), 2)
                    mae += math.fabs(rui_predict - rui_test)
                    count_comp += 1
                except KeyError:
                    # (user, item) pair missing from the prediction file: skip it
                    pass

    if count_comp != 0:
        rmse = math.sqrt(float(rmse) / float(count_comp))
        # BUG FIX: MAE is the plain mean of absolute errors -- the original
        # code wrongly applied math.sqrt() to it as well.
        mae = float(mae) / float(count_comp)

    return rmse, mae
def read_training_data(self):
    """
    Load the ensemble training data, build user/item index maps and create
    one set of factor matrices per training file.
    """
    self.train_info, self.list_users, self.list_items, self.num_events, _ = ReadFile(
        self.list_train_files).ensemble_test()
    self.number_users = len(self.list_users)
    self.number_items = len(self.list_items)
    # BUG FIX: removed the leftover debug override "self.num_events = 5000"
    # (it was even marked with a "# remove" comment); num_events now keeps
    # the value reported by ensemble_test().

    for u, user in enumerate(self.list_users):
        self.map_user[user] = u
    for i, item in enumerate(self.list_items):
        self.map_item[item] = i

    # one independent factor set per training file
    for r in range(len(self.list_train_files)):
        p, q, bias, beta = self._create_factors()
        self.rf[r] = {"p": p, "q": q, "bias": bias, "beta": beta}
def __init__(self, training_file, k_row=5, l_col=5, density_low=0.008):
    """
    :param training_file: (string:: file)
    :param k_row: (int) number of clusters generated by k-means in rows
    :param l_col: (int) number of clusters generated by k-means in columns
    :param density_low: (float) threshold to change the density matrix values
    """
    self.training_set = ReadFile(training_file).return_information(implicit=True)
    self.k_row = k_row
    self.l_col = l_col
    self.density_low = density_low

    # one empty bucket per row cluster and per column cluster
    self.list_row = [[] for _ in range(self.k_row)]
    self.list_col = [[] for _ in range(self.l_col)]

    self.count_total, self.count_ones = [], []
    self.density = None
    self.delta_entropy = []
def __init__(self, train_file, test_file, metadata_file, prediction_file=None, steps=30,
             learn_rate=0.01, delta=0.015, factors=10, init_mean=0.1, init_stdev=0.1,
             alpha=0.001, batch=False, n2=10, learn_rate2=0.01, delta2=0.015, space_type='\t'):
    """
    NSVD1 with item metadata: extends BaseNSVD1 with a metadata matrix and
    the hyper-parameters of the (optionally batched) learning procedure.
    """
    BaseNSVD1.__init__(self, train_file, test_file, prediction_file, factors,
                       init_mean, init_stdev, space_type)
    self.metadata = ReadFile(metadata_file, space_type=space_type).read_metadata(self.items)
    self.number_metadata = len(self.metadata["metadata"])
    self.batch = batch
    self.steps = steps
    self.learn_rate = learn_rate
    self.delta = delta
    self.alpha = alpha
    self.n2 = n2
    self.learn_rate2 = learn_rate2
    self.delta2 = delta2

    # Internal
    self.x = self.metadata['matrix']
    self.non_zero_x = []
    self.d = []
    for item_idx in range(self.number_items):
        row = self.x[item_idx]
        self.non_zero_x.append(list(np.where(row != 0)[0]))
        # items without metadata divide by zero -> inf; deliberately silenced
        with np.errstate(divide='ignore'):
            self.d.append(1 / np.dot(row.T, row))
def execute(self, measures=('Prec@5', 'Prec@10', 'NDCG@5', 'NDCG@10', 'MAP@5', 'MAP@10')):
    """
    Run the BPR MF pipeline: print dataset stats, create factors, train,
    predict and (when a test file is given) evaluate.

    :param measures: tuple of ranking metrics forwarded to self.evaluate
    """
    # methods
    print("[Case Recommender: Item Recommendation > BPR MF Algorithm]\n")
    print("training data:: ", len(self.train_set['users']), " users and ",
          len(self.train_set['items']), " items and ", self.train_set['ni'],
          " interactions | sparsity ", self.train_set['sparsity'])

    if self.test_file is not None:
        # BUG FIX: the original re-read the test file into a throwaway local
        # (and without space_type) while printing self.test_set's stats
        # anyway; the redundant disk read is removed.
        print("test data:: ", len(self.test_set['users']), " users and ",
              len(self.test_set['items']), " items and ", (self.test_set['ni']),
              " interactions | sparsity ", self.test_set['sparsity'])

    self._create_factors()
    print("training time:: ", timed(self.train_model), " sec")
    print("prediction_time:: ", timed(self.predict), " sec\n")

    if self.test_file is not None:
        self.evaluate(measures)
def read_matrix(self):
    """Load the precomputed similarity matrix from disk into si_matrix."""
    matrix_reader = ReadFile(self.similarity_matrix_file)
    self.si_matrix = matrix_reader.read_matrix()
def __init__(self, train_file, test_file=None, ranking_file=None, factors=10, learn_rate=0.05,
             num_interactions=30, num_events=None, predict_items_number=10, init_mean=0.1,
             init_stdev=0.1, reg_u=0.0025, reg_i=0.0025, reg_j=0.00025, reg_bias=0,
             use_loss=True, rank_number=10, space_type='\t'):
    """
    BPR MF state: loads train (and optional test) data, stores the learning
    hyper-parameters and builds dense user/item index maps.

    :param num_events: (int) number of sampled events per iteration;
           defaults to the number of training interactions
    """
    # external vars
    self.train_set = ReadFile(train_file, space_type=space_type).return_information()
    self.test_file = test_file
    self.ranking_file = ranking_file
    self.factors = factors
    self.learn_rate = learn_rate
    self.predict_items_number = predict_items_number
    self.init_mean = init_mean
    self.init_stdev = init_stdev
    self.num_interactions = num_interactions
    self.reg_bias = reg_bias
    self.reg_u = reg_u
    self.reg_i = reg_i
    self.reg_j = reg_j
    self.use_loss = use_loss
    self.rank_number = rank_number
    self.train_set["users"] = list(self.train_set['users'])
    self.train_set["items"] = list(self.train_set['items'])
    self.users = list(self.train_set['users'])
    self.items = list(self.train_set['items'])

    if self.test_file is not None:
        # BUG FIX: propagate the column separator to the test-file reader;
        # the original dropped space_type here.
        self.test_set = ReadFile(test_file, space_type=space_type).return_information()
        self.test_set['users'] = list(self.test_set['users'])
        self.test_set['items'] = list(self.test_set['items'])
        self.users = sorted(self.train_set['users'] + self.test_set['users'])
        self.items = sorted(self.train_set['items'] + self.test_set['items'])

    if num_events is None:
        # default: one sampled event per observed training interaction
        self.num_events = self.train_set['ni']
    else:
        self.num_events = num_events

    # internal vars
    self.loss = None
    self.loss_sample = list()
    self.ranking = list()
    self.map_items = dict()
    self.map_items_index = dict()
    self.map_users = dict()
    self.map_users_index = dict()

    for i, item in enumerate(self.items):
        self.map_items.update({item: i})
        self.map_items_index.update({i: item})
    for u, user in enumerate(self.users):
        self.map_users.update({user: u})
        self.map_users_index.update({u: user})
def __init__(self, train_file, test_file, prediction_file=None, steps=30, learn_rate=0.01,
             delta=0.015, factors=10, init_mean=0.1, init_stdev=0.1, baseline=False,
             bias_learn_rate=0.005, delta_bias=0.002, random_seed=0):
    """
    Matrix factorization state: loads data, builds dense index maps, remaps
    the training feedback to those indices and creates the factor matrices.

    :param random_seed: (int) if non-zero, seeds numpy's RNG for reproducibility
    """
    self.train_set = ReadFile(train_file).return_information()
    self.test_set = ReadFile(test_file).return_information()
    self.prediction_file = prediction_file
    self.steps = steps
    self.learn_rate = learn_rate
    self.delta = delta
    self.factors = factors
    self.init_mean = init_mean
    self.init_stdev = init_stdev
    self.baseline = baseline
    self.predictions = list()
    self.map_items = dict()
    self.map_items_index = dict()
    self.map_users = dict()
    self.map_users_index = dict()
    self.bias_learn_rate = bias_learn_rate
    self.delta_bias = delta_bias

    if random_seed != 0:
        # BUG FIX: the original ignored the argument and always called
        # np.random.seed(1); use the requested seed.
        np.random.seed(random_seed)

    self.p = None
    self.q = None
    self.bu = None
    self.bi = None

    self.users = sorted(
        set(list(self.train_set['users']) + list(self.test_set['users'])))
    self.items = sorted(
        set(list(self.train_set['items']) + list(self.test_set['items'])))

    for i, item in enumerate(self.items):
        self.map_items.update({item: i})
        self.map_items_index.update({i: item})
    for u, user in enumerate(self.users):
        self.map_users.update({user: u})
        self.map_users_index.update({u: user})

    # remap raw ids in the training feedback to dense indices
    list_feedback = list()
    self.dict_index = dict()
    for user, item, feedback in self.train_set['list_feedback']:
        list_feedback.append(
            (self.map_users[user], self.map_items[item], feedback))
        self.dict_index.setdefault(self.map_users[user],
                                   []).append(self.map_items[item])
    self.train_set['list_feedback'] = list_feedback

    self._create_factors()
def read_rankings(self):
    """Load all ranking files for the ensemble; keep the rankings info and the ir map."""
    ensemble_data = ReadFile(self.list_rankings_files).ensemble_test()
    self.rankings_info, _, _, _, self.ir = ensemble_data