def xls_to_csv(self):
    num_user = 0
    x = xlrd.open_workbook('../data/jester-data-1/jester-data-1.xls')
    x1 = x.sheet_by_name('jester-data-1-new')
    list_item_seem_by_user_test = []
    list_item_seem_by_user_train = []
    for rownum in range(x1.nrows):  # iterate over every row
        self.check = 0
        for idx, val in enumerate(x1.row_values(rownum)):
            # column 0 holds the user's rating count; skip the row when it is 50 or more
            if idx == 0 and val >= 50:
                break
            # From idx = 1 on, keep only actual ratings (99 marks "not rated");
            # while num_user <= 50 the row goes to the train split, afterwards to test
            if idx != 0 and val != 99 and num_user <= 50:
                list_item_seem_by_user_train.append([rownum, idx, val])
                self.check = 1
            elif idx != 0 and val != 99:
                list_item_seem_by_user_test.append([rownum, idx, val])
        if self.check == 1:
            num_user += 1
    WriteFile(out_test_file, list_item_seem_by_user_test).write()
    WriteFile(out_train_file, list_item_seem_by_user_train).write()
def write_files(self, trained_model):
    fold = 0
    for train_index, test_index in trained_model:
        if self.dir_folds is not None:
            train_file = self.dir_folds + str(fold) + '/train.dat'
            test_file = self.dir_folds + str(fold) + '/test.dat'
            df_train = self.df.iloc[train_index]
            df_test = self.df.iloc[test_index]
            WriteFile(train_file, sep=self.sep_write, mode=self.write_mode
                      ).write_with_pandas(df_train.sort_values(by=[0, 1]))
            WriteFile(test_file, sep=self.sep_write, mode=self.write_mode
                      ).write_with_pandas(df_test.sort_values(by=[0, 1]))
        fold += 1
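# Usage sketch (an assumption, not from the source): write_files expects an
# iterable of (train_index, test_index) pairs, e.g. as produced by
# scikit-learn's KFold. `splitter` and the index array are illustrative only.
from sklearn.model_selection import KFold
import numpy as np

kf = KFold(n_splits=10, shuffle=True, random_state=2018)
folds = kf.split(np.arange(100))   # yields (train_index, test_index) pairs
# splitter.write_files(folds)      # `splitter` is a hypothetical instance of the class above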
def write_ranking(self):
    """
    Method to write final ranking
    """
    if self.output_file is not None:
        WriteFile(self.output_file, data=self.ranking, sep=self.sep).write()
def write_predictions(self):
    """
    Method to write final predictions
    """
    if self.output_file is not None:
        WriteFile(self.output_file, data=self.predictions, sep=self.sep).write()
def recommendation_step(self):
    for user in self.test_set['users']:
        user_id = self.user_to_user_id[user]
        bu, hu = mean_confidence_interval(
            list(self.train_set['feedback'][user].values()), confidence=.95)
        for item in self.test_set['items_seen_by_user'][user]:
            cluster = self.father_of[user_id]
            '''
            mi^k -> mean of the item's ratings in a subset k
            mu -> mean of user u's ratings
            * use h -> the difference between the mean and the interval bound
              (only for climbing the tree?)
            rui = (wi * mi^k + wu * mu) / (wi + wu)
            '''
            bi = 0
            last_h = float('inf')
            while True:
                if cluster is None:
                    break
                if self.cluster_item_interval[cluster].get(item, -1) == -1:
                    # item not rated in this cluster; climb to the parent
                    cluster = self.father_of[cluster]
                else:
                    new_h = self.cluster_item_interval[cluster][item][1]
                    if np.isnan(new_h) or new_h == 0:
                        bi = self.cluster_item_interval[cluster][item][0]
                        cluster = self.father_of[cluster]
                    elif new_h < last_h:
                        # narrower interval: keep this cluster's item mean
                        last_h = new_h
                        bi = self.cluster_item_interval[cluster][item][0]
                        cluster = self.father_of[cluster]
                    else:
                        cluster = self.father_of[cluster]
            if bi == 0:
                rui = bu
            else:
                rui = .5 * bu + .5 * bi
            self.predictions.append((user, item, rui))

    self.predictions = sorted(self.predictions, key=lambda x: x[1])
    if self.output_file is not None:
        WriteFile(self.output_file, data=self.predictions, sep=self.sep).write()
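# Note: mean_confidence_interval is not defined in this snippet. A plausible
# implementation (an assumption, matching the (bu, hu) unpacking above) returns
# the sample mean and the half-width h of a Student-t confidence interval.
import numpy as np
import scipy.stats as st

def mean_confidence_interval(data, confidence=.95):
    a = np.asarray(data, dtype=float)
    mean = np.mean(a)
    # standard error of the mean times the t critical value gives the half-width
    h = st.sem(a) * st.t.ppf((1 + confidence) / 2., len(a) - 1)
    return mean, h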
def generate_groups(self):
    fold_for_sets = self.dir_name + '/gb_train_' + str(self.parser) + '/'
    if not os.path.exists(fold_for_sets):
        os.mkdir(fold_for_sets)

    train_tuple = self.run_kmedoids()
    self.k_groups = len(train_tuple)

    for f in range(len(train_tuple)):
        train_file_name = fold_for_sets + 'train_%d.dat' % f
        WriteFile(train_file_name, data=train_tuple[f], sep=self.sep).write()
        self.gb_train_files.append(train_file_name)

    del self.train_set_list
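# Shape sketch (an assumption): run_kmedoids() is taken to return one feedback
# list per medoid group, each a list of (user, item, score) triples that
# WriteFile can serialize, so group f ends up in train_f.dat:
example_train_tuple = [
    [(1, 10, 4.0), (1, 12, 3.5)],  # group 0 -> train_0.dat
    [(2, 10, 5.0), (3, 11, 2.0)],  # group 1 -> train_1.dat
]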
def case_rec_evaluation(sess, args, model, data, ripple_set, batch_size):
    predictions_output_filepath = '../data/' + args.dataset + '/ripplenet_preds.dat'
    test_output_filepath = '../data/' + args.dataset + '/ripplenet_tests.dat'
    i_map = load_dict('../data/' + args.dataset + '/i_map.txt')
    u_map = load_dict('../data/' + args.dataset + '/u_map.txt')

    start = 0
    print_preds = []
    while start < data.shape[0]:
        feed_dict = get_feed_dict(args, model, data, ripple_set, start, start + batch_size)
        labels, scores = sess.run([model.labels, model.scores_normalized], feed_dict)
        print('len_scores:%d\tlen_items:%d' % (len(scores), len(feed_dict[model.items])))
        # for u, u_scores in enumerate(scores):
        #     for i, score in enumerate(u_scores):
        #         print_preds.append((u_map[start + u], i_map[i], score))
        start += batch_size
    WriteFile(predictions_output_filepath, data=print_preds, sep='\t').write()

    print_tests = []
    for u, u_data in enumerate(data):
        for i, score in enumerate(u_data):
            print_tests.append((u_map[u], i_map[i], score))
    WriteFile(test_output_filepath, data=print_tests, sep='\t').write()

    # Using CaseRecommender ReadFile class to read test_set from file
    eval_data = ReadFile(input_file=test_output_filepath).read()
    predictions_data = ReadFile(input_file=predictions_output_filepath).read()

    # Creating CaseRecommender evaluator with item-recommendation parameters
    evaluator = ItemRecommendationEvaluation(n_ranks=[10])

    # Getting evaluation
    item_rec_metrics = evaluator.evaluate(predictions_data['feedback'], eval_data)

    print('\nItem Recommendation Metrics:\n', item_rec_metrics)

    return item_rec_metrics
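# Assumption: load_dict is a local helper not shown in this snippet. A minimal
# sketch consistent with its use above, reading "index<TAB>original_id" pairs:
def load_dict(path):
    mapping = {}
    with open(path) as f:
        for line in f:
            key, value = line.strip().split('\t')
            mapping[int(key)] = int(value)
    return mapping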
def case_rec_evaluator(test_file, predictions_file, top_score_dict):
    print_list = []
    for user, item_list in top_score_dict.items():
        for rank, item in enumerate(item_list):
            score = 1.0 / (rank + 1)  # reciprocal rank as the item's score
            print_list.append((int(user[1:]), int(item[1:]), float(score)))
    WriteFile(predictions_file, data=print_list, sep='\t').write()

    # Using CaseRecommender ReadFile class to read test_set from file
    eval_data = ReadFile(input_file=test_file).read()
    predictions_data = ReadFile(input_file=predictions_file).read()

    # Creating CaseRecommender evaluator with item-recommendation parameters
    evaluator = ItemRecommendationEvaluation(n_ranks=[10])

    # Getting evaluation
    item_rec_metrics = evaluator.evaluate(predictions_data['feedback'], eval_data)

    print('\nItem Recommendation Metrics:\n', item_rec_metrics)

    return item_rec_metrics
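# Usage sketch with hypothetical paths and ids: user/item keys carry a
# one-letter prefix that case_rec_evaluator strips, and the ranked items get
# reciprocal-rank scores (1.0, 0.5, 0.333, ...). test.dat is assumed to hold
# the ground-truth feedback in CaseRecommender's format.
top_scores = {'u1': ['i42', 'i7', 'i13'], 'u2': ['i7', 'i42']}
metrics = case_rec_evaluator('test.dat', 'preds.dat', top_scores)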
def case_rec_evaluation(sess, model, users_to_test, Ks, drop_flag=False, batch_test_flag=False):
    batch_test_flag = False
    ### Added:
    preds_output_filepath = '../Data/ml1m-sun2kgat/kgat_pred.txt'
    test_output_filepath = '../Data/ml1m-sun2kgat/case_rec_test.txt'
    ### Added-
    result = {'precision': np.zeros(len(Ks)), 'recall': np.zeros(len(Ks)),
              'ndcg': np.zeros(len(Ks)), 'hit_ratio': np.zeros(len(Ks)), 'auc': 0.}
    ### Removed:
    ### pool = multiprocessing.Pool(cores)
    ### Removed-
    if args.model_type in ['ripple']:
        u_batch_size = BATCH_SIZE
        i_batch_size = BATCH_SIZE // 20
    elif args.model_type in ['fm', 'nfm']:
        u_batch_size = BATCH_SIZE
        i_batch_size = BATCH_SIZE
    else:
        u_batch_size = BATCH_SIZE * 2
        i_batch_size = BATCH_SIZE

    test_users = users_to_test
    n_test_users = len(test_users)
    n_user_batchs = n_test_users // u_batch_size + 1

    count = 0
    print_preds = []
    for u_batch_id in range(n_user_batchs):
        start = u_batch_id * u_batch_size
        end = (u_batch_id + 1) * u_batch_size
        user_batch = test_users[start: end]

        # if batch_test_flag:
        #     n_item_batchs = ITEM_NUM // i_batch_size + 1
        #     rate_batch = np.zeros(shape=(len(user_batch), ITEM_NUM))
        #
        #     i_count = 0
        #     for i_batch_id in range(n_item_batchs):
        #         i_start = i_batch_id * i_batch_size
        #         i_end = min((i_batch_id + 1) * i_batch_size, ITEM_NUM)
        #
        #         item_batch = range(i_start, i_end)
        #
        #         feed_dict = data_generator.generate_test_feed_dict(model=model,
        #                                                            user_batch=user_batch,
        #                                                            item_batch=item_batch,
        #                                                            drop_flag=drop_flag)
        #         i_rate_batch = model.eval(sess, feed_dict=feed_dict)
        #         i_rate_batch = i_rate_batch.reshape((-1, len(item_batch)))
        #
        #         rate_batch[:, i_start: i_end] = i_rate_batch
        #         i_count += i_rate_batch.shape[1]
        #
        #     assert i_count == ITEM_NUM
        # else:
        item_batch = range(ITEM_NUM)
        feed_dict = data_generator.generate_test_feed_dict(model=model,
                                                           user_batch=user_batch,
                                                           item_batch=item_batch,
                                                           drop_flag=drop_flag)
        rate_batch = model.eval(sess, feed_dict=feed_dict)
        rate_batch = rate_batch.reshape((-1, len(item_batch)))

        user_batch_rating_uid = zip(rate_batch, user_batch)
        ### Removed: from function test in utility/batch_test.py
        ## batch_result = pool.map(test_one_user, user_batch_rating_uid)
        ### Removed-

        ### Added: from function test_one_user in utility/batch_test.py:
        for rating, u in user_batch_rating_uid:
            try:
                training_items = data_generator.train_user_dict[u]
            except Exception:
                training_items = []
            # score only the items the user did not interact with in training
            all_items = set(range(data_generator.n_items))
            test_items = list(all_items - set(training_items))

            item_score = {}
            for i in test_items:
                item_score[i] = rating[i]

            # keep the K_max highest-scoring items per user
            K_max = max(Ks)
            K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get)

            for i in K_max_item_score:
                score = item_score[i]
                print_preds.append((u, i, score))

    WriteFile(preds_output_filepath, data=print_preds, sep='\t').write()
    # for rating, u in user_batch_rating_uid:
    #     # user u's items in the test set
    #     user_pos_test = data_generator.test_user_dict[u]
    #     for i in user_pos_test:
    #         print_tests.append((u, i))
    # WriteFile(test_output_filepath, data=print_tests, sep='\t', as_binary=True).write()
    ### Added-

    ### Removed:
    ### count += len(batch_result)
    ### for re in batch_result:
    ###     result['precision'] += re['precision'] / n_test_users
    ###     result['recall'] += re['recall'] / n_test_users
    ###     result['ndcg'] += re['ndcg'] / n_test_users
    ###     result['hit_ratio'] += re['hit_ratio'] / n_test_users
    ###     result['auc'] += re['auc'] / n_test_users
    ### assert count == n_test_users
    ### pool.close()
    ### Removed-

    # Using CaseRecommender ReadFile class to read test_set from file
    eval_data = ReadFile(input_file=test_output_filepath, as_binary=True).read()
    predictions_data = ReadFile(input_file=preds_output_filepath).read()

    # Creating CaseRecommender evaluator with item-recommendation parameters
    evaluator = ItemRecommendationEvaluation(n_ranks=[10])

    # Getting evaluation
    item_rec_metrics = evaluator.evaluate(predictions_data['feedback'], eval_data)

    print('\nItem Recommendation Metrics:\n', item_rec_metrics)

    return item_rec_metrics
def case_rec_evaluateRec(FLAGS, model, eval_iter, eval_dict, all_dicts, i_map,
                         logger, i, eval_descending=True, is_report=False):
    # Evaluate
    total_batches = len(eval_iter)
    # processing bar
    pbar = tqdm(total=total_batches)
    pbar.set_description("Run Eval")

    all_i_var = None
    if FLAGS.share_embeddings:
        all_i_ids = [i_map[idx] for idx in range(len(i_map))]
        all_i_var = to_gpu(V(torch.LongTensor(all_i_ids)))

    model.eval()
    model.disable_grad()

    results = []
    for u_ids in eval_iter:
        u_var = to_gpu(V(torch.LongTensor(u_ids)))
        # batch * item
        scores = model.evaluateRec(u_var, all_i_ids=all_i_var)
        preds = zip(u_ids, scores.data.cpu().numpy())
        results.extend(evalRecProcess(list(preds), eval_dict, all_dicts=all_dicts,
                                      descending=eval_descending,
                                      num_processes=FLAGS.num_processes,
                                      topn=FLAGS.topn, queue_limit=FLAGS.max_queue))
        pbar.update(1)
    pbar.close()

    # [(pred[0], top_ids, gold), ...], gold is test
    predictions = [result[5] for result in results]
    print("Saving predictions. Size: {}.".format(str(len(predictions))))
    predictions_output_filepath = os.path.join(FLAGS.log_path,
                                               FLAGS.experiment_name + '_pred.dat')
    print_list = []
    for triple in predictions:
        u_id = triple[0]
        top_ids = triple[1]
        # gold = triple[2]
        for rank, i_id in enumerate(top_ids):
            score = 1.0 / (rank + 1)  # reciprocal rank as the item's score
            print_list.append((u_id, i_id, score))
    WriteFile(predictions_output_filepath, data=print_list, sep='\t').write()

    # Using CaseRecommender ReadFile class to read test_set from file
    dataset_path = os.path.join(FLAGS.data_path, FLAGS.dataset)
    eval_files = FLAGS.rec_test_files.split(':')
    test_path = os.path.join(dataset_path, eval_files[i])
    eval_data = ReadFile(input_file=test_path).read()
    predictions_data = ReadFile(input_file=predictions_output_filepath).read()
    print("Reading predictions. Size: {}.".format(str(len(predictions_data['feedback']))))

    # Creating CaseRecommender evaluator with item-recommendation parameters
    evaluator = ItemRecommendationEvaluation(n_ranks=[10])
    item_rec_metrics = evaluator.evaluate(predictions_data['feedback'], eval_data)
    print("From CaseRecommender evaluator: {}.".format(str(item_rec_metrics)))
    logger.info("From CaseRecommender evaluator: {}.".format(str(item_rec_metrics)))

    # Creating kg-summ-rec evaluator with diversity parameters
    dataset_name = os.path.basename(os.path.dirname(os.path.dirname(FLAGS.log_path)))
    tags = dataset_name.split('_')
    if tags[0] == 'ml-sun':
        evaluator2 = DiversityEvaluation(n_ranks=[10])
        dataset_path = os.path.normpath(FLAGS.data_path + os.sep + os.pardir)
        # tags = dataset_name.split('-')
        # if len(tags) > 2:
        #     mode = dataset_name.split('-')[2]
        #     ratio = dataset_name.split('-')[4]
        # else:
        #     mode = 'sv'
        #     ratio = '100'
        dataset_path = os.path.normpath(FLAGS.data_path + os.sep + os.pardir +
                                        os.sep + os.pardir + os.sep + tags[0] +
                                        '_' + tags[1] + '_' + 'oKG')
        mode = 'sv'
        ratio = '100'
        i2genre_map = read_i2genre_map(dataset_path, mode, ratio)
        diversity_metrics = evaluator2.evaluate(predictions_data['feedback'],
                                                eval_data, i2genre_map)
        print("From kg-summ-rec diversity evaluator: {}.".format(str(diversity_metrics)))
        logger.info("From kg-summ-rec diversity evaluator: {}.".format(str(diversity_metrics)))

    model.enable_grad()
    return item_rec_metrics
def export_data(self):
    self.processData()
    WriteFile(self.outTestFile, self.test_data).write()
    WriteFile(self.outTrainFile, self.train_data).write()