def __init__(self):
    super(DataStatis, self).__init__()
    self.config = ConfigX()
    self.rg = RatingGetter()  # loading rating data
    self.tg = TrustGetter()   # loading social/trust data
    # counters for cold-start statistics
    self.cold_rating = 0
    self.cold_social = 0
    self.cold_rating_social = 0
    self.cold_rating_warm_social = 0
    self.warm_rating_cold_social = 0
    self.warm_rating_warm_social = 0
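A minimal sketch, not from the source, of how these counters might be filled; `get_followees` and the 5-rating threshold are assumptions about the RatingGetter/TrustGetter APIs.

def count_cold_stats(self, threshold=5):
    # classify every training user by rating coverage and social coverage
    for user in self.rg.trainSet_u:
        warm_rating = len(self.rg.trainSet_u[user]) > threshold
        warm_social = len(self.tg.get_followees(user)) > 0  # assumed TrustGetter API
        if not warm_rating:
            self.cold_rating += 1
        if not warm_social:
            self.cold_social += 1
        if not warm_rating and not warm_social:
            self.cold_rating_social += 1
        elif not warm_rating and warm_social:
            self.cold_rating_warm_social += 1
        elif warm_rating and not warm_social:
            self.warm_rating_cold_social += 1
        else:
            self.warm_rating_warm_social += 1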
def build_user_item_sim_CF(self, kfold, user_near_num=50, item_near_num=50, load_save_sim=False):
    self.rg = RatingGetter(kfold)
    self.mg = MetaGetter(kfold)
    from collections import defaultdict

    # compute item-item similarity matrix
    print('Building item-item similarity matrix ...')
    if load_save_sim:
        self.item_sim = util.load_data(
            '../data/sim/%s_08_ii_gemf_cv0.pkl' % self.config.dataset_name)
    else:
        # item similarity computation is wrapped in MetaGetter
        self.item_sim = self.mg.getSimMatrix(jaccard_sim)
        util.save_data(
            self.item_sim,
            '../data/sim/%s_08_ii_gemf_cv0.pkl' % self.config.dataset_name)

    # compute the k neighbors of each item
    if load_save_sim:
        self.item_k_neibor = util.load_data(
            '../data/neibor/%s_08_ii_%s_neibor_gemf_cv0.pkl' % (self.config.dataset_name, item_near_num))
    else:
        for item in self.mg.item:
            matchItems = sorted(self.item_sim[item].items(), key=lambda x: x[1], reverse=True)[:item_near_num]
            self.item_k_neibor[item] = dict(matchItems)
        util.save_data(
            self.item_k_neibor,
            '../data/neibor/%s_08_ii_%s_neibor_gemf_cv0.pkl' % (self.config.dataset_name, item_near_num))

    # compute user-user similarity matrix
    print('Building user-user similarity matrix ...')
    if load_save_sim:
        self.user_sim = util.load_data(
            '../data/sim/%s_08_uu_gemf_cv0.pkl' % self.config.dataset_name)
    else:
        # build the collaborative user network (CUNet): link two users when
        # they have rated at least one common item
        itemNet = {}
        for item in self.rg.trainSet_i:
            if len(self.rg.trainSet_i[item]) > 1:
                itemNet[item] = self.rg.trainSet_i[item]
        filteredRatings = defaultdict(list)
        for item in itemNet:
            for user in itemNet[item]:
                if itemNet[item][user] > 0:
                    filteredRatings[user].append(item)
        self.CUNet = defaultdict(list)
        for user1 in tqdm(filteredRatings):
            s1 = set(filteredRatings[user1])
            for user2 in filteredRatings:
                if user1 != user2:
                    s2 = set(filteredRatings[user2])
                    weight = len(s1.intersection(s2))
                    if weight > 0:
                        self.CUNet[user1] += [user2]

        print('Generating random deep walks...')
        self.walks = []
        self.visited = defaultdict(dict)
        for user in tqdm(self.CUNet):
            for t in range(self.config.walkCount):
                path = [str(user)]
                lastNode = user
                for i in range(1, self.config.walkLength):
                    nextNode = choice(self.CUNet[lastNode])
                    count = 0
                    while nextNode in self.visited[lastNode]:
                        nextNode = choice(self.CUNet[lastNode])
                        # break out of an infinite loop
                        count += 1
                        if count == self.config.walkLength:  # 10
                            break
                    path.append(str(nextNode))
                    self.visited[user][nextNode] = 1
                    lastNode = nextNode
                self.walks.append(path)

        # learn node embeddings from the walks (gensim < 4.0 keyword names)
        self.model = w2v.Word2Vec(self.walks, size=self.config.walkDim, window=5, min_count=0, iter=3)
        self.topKSim = defaultdict(dict)
        i = 0
        for u1 in tqdm(self.CUNet):
            sims = {}
            for u2 in self.CUNet:
                if u1 != u2:  # bug fix: was `user1 != user2`, stale variables from the loop above
                    if self.user_sim.contains(u1, u2):
                        continue
                    wu1 = self.model[str(u1)]
                    wu2 = self.model[str(u2)]
                    sims[u2] = cosine(wu1, wu2)  # TODO: what if a vector is empty?
                    self.user_sim.set(u1, u2, sims[u2])
            i += 1
            if i % 200 == 0:
                print('progress:', i, '/', len(self.CUNet))
        if not os.path.exists('../data/sim'):
            os.makedirs('../data/sim')
            print('../data/sim folder has been established.')
        util.save_data(
            self.user_sim,
            '../data/sim/%s_08_uu_gemf_cv0.pkl' % self.config.dataset_name)

    # compute the k neighbors of each user
    if load_save_sim:
        self.user_k_neibor = util.load_data(
            '../data/neibor/%s_08_uu_%s_neibor_gemf_cv0.pkl' % (self.config.dataset_name, user_near_num))
    else:
        for user in self.rg.user:
            # bug fix: rank each user's own similarities (the original indexed
            # with the stale `u1`/`sims` left over from the loop above)
            matchUsers = sorted(self.user_sim[user].items(), key=lambda d: d[1], reverse=True)[:user_near_num]
            self.user_k_neibor[user] = dict(matchUsers)
        if not os.path.exists('../data/neibor'):
            os.makedirs('../data/neibor')
            print('../data/neibor folder has been established.')
        util.save_data(
            self.user_k_neibor,
            '../data/neibor/%s_08_uu_%s_neibor_gemf_cv0.pkl' % (self.config.dataset_name, user_near_num))
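The `cosine` helper above is the repo's own utility and is not shown here; the inline TODO asks what happens on empty vectors. A minimal sketch of one plausible definition (a similarity, not scipy's cosine *distance*), guarding the zero-vector case, under the assumption that both inputs are embedding vectors:

import numpy as np

def cosine(x, y):
    # cosine similarity of two embedding vectors; returns 0.0 when either
    # vector has zero norm instead of dividing by zero
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    return float(np.dot(x, y) / denom) if denom > 0 else 0.0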
class GEMF(MF):
    """
    python implementation for GEMF (graph-embedding matrix factorization)
    """

    def __init__(self):
        super(GEMF, self).__init__()
        self.rg = RatingGetter()
        ex_file = 'yp_trust'
        self.explict_trust_path = '../data/net/' + ex_file + '.txt'
        weight = 0.5
        # file = '%s_weight_%s' % (self.config.dataset_name, weight)
        file = 'yp_CUnet_weight'
        self.implict_trust_path = '../data/net/' + file + '.txt'

        # Alternative network files used in earlier experiments:
        # file = '%s_CUnet_weight_nnn' % self.config.dataset_name
        # file = '%s_less_CUnet_weight' % self.config.dataset_name
        # self.implict_trust_path = '../data/' + file + '.txt'
        # self.implict_trust_path = '../data/yp_30_39_rating_im_net_new.txt'
        # ft_3 & db_13 & ca_16 & yp_30_39  # & ca_23 & db_18
        ############## 1 ################
        # ex_file = '%s_filter_trust_new' % self.config.dataset_name
        # file = '%s_CUnet_weight_new' % self.config.dataset_name
        # self.implict_trust_path = '../data/' + file + '.txt'
        # self.explict_trust_path = '../data/' + ex_file + '.txt'
        ############## 2 ################
        # file = 'ft_3_rating_im_net'
        # file = 'ft_3_rating_im_net_new'  # ft_3 & db_18 & ca_23 & yp_30_39 for new
        # self.implict_trust_path = '../data/' + file + '.txt'
        ############## 3 ################
        # weight = 0.3
        # file = '%s_two_net_with_weight_%s_rewrited' % (self.config.dataset_name, weight)
        # file = '%s_two_net_with_weight_%s_new_rewrited' % (self.config.dataset_name, weight)
        # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'
        ############## 4 ################
        # file = '%s_two_net_with_tanh_rewrited' % (self.config.dataset_name)
        # file = '%s_two_net_with_tanh_new_rewrited' % (self.config.dataset_name)
        # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'
        ############## 5 ################
        # file = '%s_inter_net' % self.config.dataset_name
        # file = '%s_union_net' % self.config.dataset_name
        # file = '%s_union_net_expanded' % self.config.dataset_name
        # file = '%s_inter_net_new' % self.config.dataset_name
        # file = '%s_union_net_new' % self.config.dataset_name
        # file = '%s_union_net_new_expanded' % self.config.dataset_name
        # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'

        # parameters for matrix factorization
        self.config.lr = 0.01
        self.config.lambdaP = 0.03  # 0.03
        self.config.lambdaQ = 0.01  # 0.01
        self.config.lambdaB = 0.01  # 0.01
        self.config.temp1 = 0.01
        self.config.temp2 = 0.01
        self.config.alpha = self.config.temp1
        self.config.beta = self.config.temp2
        self.config.factor = 10
        self.config.isEarlyStopping = True
        self.config.k_fold_num = 5

        # parameters for netwalker
        self.config.random_state = 0
        self.config.number_walks = 30  # number of random walks per node (was 5)
        self.config.path_length = 20   # length of each random walk (was 10)
        self.config.restart_pro = 0.1  # the probability of restarts
        self.config.undirected = True
        self.config.ex_walk_result_path = '../data/ge/' + ex_file + '_social_corpus_filter.txt'
        self.config.im_walk_result_path = '../data/ge/' + file + '_social_corpus_implict.txt'

        # parameters for graph embedding
        self.config.lambdaW = 1
        self.config.ex_table_path = '../data/ge/' + ex_file + '_table_filter.pkl'
        self.config.ex_model_out_path = '../data/ge/' + ex_file + '_result_filter.txt'
        self.config.im_table_path = '../data/ge/' + file + '_table_implict.pkl'
        self.config.im_model_out_path = '../data/ge/' + file + '_result_implict.txt'
        self.config.cbow = 0
        self.config.neg = 5
        self.config.w2v_lr = 0.01  # 0.01-0.81
        self.config.win_size = 10
        self.config.min_count = 3
        self.config.binary = 0

        self.dataSet_u = defaultdict(dict)
        self.dataSet_i = defaultdict(dict)
        self.filteredRatings = defaultdict(list)
        self.CUNet = defaultdict(list)
        self.walks = []
        self.ex_walks = []
        self.im_walks = []
        # self.visited = defaultdict(dict)
        self.ex_pos_loss_total = 0
        self.ex_neg_loss_total = 0
        self.im_pos_loss_total = 0
        self.im_neg_loss_total = 0

        # cpprint('k is %s' % self.config.near_num)
        cpprint('implict_trust_path is %s' % self.implict_trust_path)
        cpprint('explict_trust_path is %s' % self.explict_trust_path)
        cpprint('lr is %s' % self.config.lr)
        cpprint('neg is %s' % self.config.neg)
        cpprint('w2v_lr is %s' % self.config.w2v_lr)
        cpprint('win_size is %s' % self.config.win_size)
        cpprint('alpha is %s' % self.config.alpha)
        cpprint('beta is %s' % self.config.beta)
        cpprint('lambdaP is %s' % self.config.lambdaP)
        cpprint('lambdaQ is %s' % self.config.lambdaQ)
        cpprint('number_walks is %s' % self.config.number_walks)
        cpprint('path_length is %s' % self.config.path_length)
        # cpprint('factor is %s' % self.config.factor)

        self.init_model()

    def init_model(self):
        super(GEMF, self).init_model()
        print('starting initialization...')

        # 1. extract the user corpus from the social networks via netwalker
        print('=' * 5 + 'extracting user corpus with users social network' + '=' * 5)
        # Earlier variants (kept for reference):
        #   ex_G = netwalker.load_edgelist_without_weight(self.explict_trust_path, undirected=self.config.undirected)  # explicit user network (trust.txt)
        #   im_G = netwalker.load_edgelist_without_weight(self.implict_trust_path, undirected=self.config.undirected)  # implicit user network (CUNet)
        #   self.ex_walks = netwalker.build_deepwalk_corpus(ex_G, self.config.number_walks, self.config.path_length,
        #                                                   self.config.restart_pro, self.config.random_state)
        #   self.im_walks = netwalker.build_deepwalk_corpus(im_G, self.config.number_walks, self.config.path_length,
        #                                                   self.config.restart_pro, self.config.random_state)
        # or, with edge weights and alpha:
        #   ex_weight_dic, ex_G = netwalker.load_edgelist_with_weight(self.explict_trust_path, undirected=self.config.undirected)
        #   im_weight_dic, im_G = netwalker.load_edgelist_with_weight(self.implict_trust_path, undirected=self.config.undirected)
        #   self.ex_walks = netwalker.deepwalk_with_alpha(ex_G, ex_weight_dic, self.config.number_walks,
        #                                                 self.config.path_length, self.config.restart_pro, self.config.random_state)
        #   self.im_walks = netwalker.deepwalk_with_alpha(im_G, im_weight_dic, self.config.number_walks,
        #                                                 self.config.path_length, self.config.restart_pro, self.config.random_state)
        ex_weight_dic, ex_G = netwalker.load_edgelist_with_weight(
            self.explict_trust_path, undirected=self.config.undirected)  # explicit user network (trust.txt)
        im_weight_dic, im_G = netwalker.load_edgelist_with_weight(
            self.implict_trust_path, undirected=self.config.undirected)  # implicit user network (CUNet)
        # self.ex_walks = netwalker.deepwalk_without_alpha(ex_G, ex_weight_dic, self.config.number_walks,
        #                                                  self.config.path_length, self.config.restart_pro, self.config.random_state)
        self.ex_walks = netwalker.deepwalk_without_alpha_for_ex(
            ex_G, ex_weight_dic, im_weight_dic, self.config.number_walks,
            self.config.path_length, self.config.restart_pro, self.config.random_state)
        self.im_walks = netwalker.deepwalk_without_alpha(
            im_G, im_weight_dic, self.config.number_walks,
            self.config.path_length, self.config.restart_pro, self.config.random_state)

        # shuffle the walks
        np.random.shuffle(self.ex_walks)
        np.random.shuffle(self.im_walks)
        # cpprint(walks)
        netwalker.save_walks(self.ex_walks, self.config.ex_walk_result_path)
        netwalker.save_walks(self.im_walks, self.config.im_walk_result_path)
        print('=' * 5 + 'generating inverted index...' + '=' * 5)
        self.inverted_index()
        # print(self.node_inverted_index)

        # 2. initialize the W and W' matrices for graph embedding
        print('=' * 5 + 'read social corpus' + '=' * 5)
        ex_fi = open(self.config.ex_walk_result_path, 'r')  # training corpus
        im_fi = open(self.config.im_walk_result_path, 'r')  # training corpus
        self.ex_social_vocab = Vocab(ex_fi, self.config.min_count)  # user nodes and their indices
        self.im_social_vocab = Vocab(im_fi, self.config.min_count)  # user nodes and their indices
        # Social users may not all appear in the user-item matrix; if they are a
        # subset we are fine, otherwise the missing users must be randomly
        # initialized (see reset_index).
        print('=' * 5 + 'initialize network for word2vec' + '=' * 5)
        self.reset_index(self.rg, self.im_social_vocab)
        self.init_net()
        print('=' * 5 + 'generate the unigram table for word2vec' + '=' * 5)
        if not os.path.exists(self.config.ex_table_path):  # reuse a saved table if it exists
            self.ex_table = UnigramTable(self.ex_social_vocab)
            util.save_data(self.ex_table, self.config.ex_table_path)
        else:
            self.ex_table = util.load_data(self.config.ex_table_path)
        if not os.path.exists(self.config.im_table_path):
            self.im_table = UnigramTable(self.im_social_vocab)
            util.save_data(self.im_table, self.config.im_table_path)
        else:
            self.im_table = util.load_data(self.config.im_table_path)

    def inverted_index(self):
        self.ex_node_inverted_index = defaultdict(set)
        self.im_node_inverted_index = defaultdict(set)
        for index, line in enumerate(self.ex_walks):
            for node in line:
                self.ex_node_inverted_index[node].add(index)
        for index, line in enumerate(self.im_walks):
            for node in line:
                self.im_node_inverted_index[node].add(index)

    def reset_index(self, rg, vocab):
        # rg holds the users of the rating training set. Count how many social
        # users never appear in the user-item (UI) matrix; they must be
        # initialized and merged into P.
        print('the current user number in ui is ' + str(len(self.rg.user)))
        not_exists_ui = []  # users absent from the UI matrix
        num = 0
        mapping = {'<bol>': -1, '<eol>': -2, '<unk>': -3}
        for user in vocab.vocab_hash:
            if user != '<bol>' and user != '<eol>' and user != '<unk>' and int(user) not in rg.user:
                num += 1
                not_exists_ui.append(int(user))
                # The social user is missing from the UI matrix: extend the
                # training set with a new index and share it with the social vocab.
                self.rg.user[int(user)] = len(self.rg.user)  # extend self.rg.user
                self.im_social_vocab.vocab_hash[user] = self.rg.user[int(user)]  # reset the social-corpus index to match the UI matrix
            elif user != '<bol>' and user != '<eol>' and user != '<unk>':
                # The social user already exists in the UI matrix: reuse its index.
                self.im_social_vocab.vocab_hash[user] = self.rg.user[int(user)]
            else:
                # index the three special tokens
                index = mapping[user]
                self.rg.user[index] = len(self.rg.user)
                self.im_social_vocab.vocab_hash[user] = self.rg.user[index]
        print("the number of not exists in ui is " + str(num))

    def init_net(self):
        # a = np.sqrt((self.config.max_val + self.config.min_val) / self.config.factor)
        # The scale of the random initialization matters a lot here.
        self.P = np.random.rand(self.rg.get_train_size()[0], self.config.factor) / (
            self.config.factor ** 0.5)  # user latent vectors shared by MF and GE
        self.ex_W = np.random.rand(self.rg.get_train_size()[0], self.config.factor) / (self.config.factor ** 0.5)
        self.im_W = np.random.rand(self.rg.get_train_size()[0], self.config.factor) / (self.config.factor ** 0.5)
        self.Bu = np.random.rand(self.rg.get_train_size()[0]) / (self.config.factor ** 0.5)  # bias value of user
        self.Bi = np.random.rand(self.rg.get_train_size()[1]) / (self.config.factor ** 0.5)  # bias value of item
        print('the shape of P is ' + str(self.P.shape))
        self.ex_pos_neu1 = np.zeros(self.config.factor)
        self.ex_neg_neul = np.zeros(self.config.factor)
        self.im_pos_neu1 = np.zeros(self.config.factor)
        self.im_neg_neul = np.zeros(self.config.factor)

    def train_model(self):
        super(GEMF, self).train_model()
        iteration = 0
        while iteration < self.config.maxIter:
            self.loss = 0
            self.ex_pos_loss_total = 0
            self.ex_neg_loss_total = 0
            self.im_pos_loss_total = 0
            self.im_neg_loss_total = 0
            for index, line in enumerate(self.rg.trainSet()):
                user, item, rating = line
                u = self.rg.user[user]
                i = self.rg.item[item]
                error = rating - self.predict(user, item)
                self.loss += 0.5 * error ** 2  # should the loss be computed at the end instead?
                p, q = self.P[u], self.Q[i]
                bu, bi = self.Bu[u], self.Bi[i]

                # word2vec-style update on the implicit network
                if str(user) not in self.im_node_inverted_index:  # bug fix: was ex_node_inverted_index
                    self.im_pos_loss_total += 0
                    self.im_neg_loss_total += 0
                    self.config.beta = 0  # bug fix: beta weights the implicit terms in the P update (was alpha)
                else:
                    self.config.beta = self.config.temp2
                    im_walks_list = self.im_node_inverted_index[str(user)]
                    im_wl = len(im_walks_list)
                    if im_wl > 0:
                        rand_num = np.random.randint(low=0, high=len(im_walks_list))
                        # print(rand_num)
                        im_line_num = list(im_walks_list)[rand_num]
                        im_walks = self.im_walks[im_line_num]
                        im_sent = self.im_social_vocab.indices(['<bol>'] + im_walks + ['<eol>'])
                        for im_sent_pos, im_token in enumerate(im_sent):
                            if im_token != u:
                                continue
                            im_current_win = self.config.win_size  # np.random.randint(low=1, high=self.config.win_size + 1)
                            im_context_start = max(im_sent_pos - im_current_win, 0)
                            im_context_end = min(im_sent_pos + im_current_win + 1, len(im_sent))
                            # take the context around the center word
                            im_context = im_sent[im_context_start:im_sent_pos] + im_sent[im_sent_pos + 1:im_context_end]
                            # column-wise mean of the context vectors gives h of N+
                            self.im_pos_neu1 = np.mean(np.array([self.im_W[c] for c in im_context]), axis=0)
                            self.im_neg_neul = np.mean(np.array([
                                self.im_W[target] for target in self.im_table.sample(self.config.neg)]), axis=0)
                            self.im_pos_loss_total += np.log(sigmoid(np.dot(self.P[im_token], self.im_pos_neu1)))
                            self.im_neg_loss_total += np.log(sigmoid(-np.dot(self.P[im_token], self.im_neg_neul)))
                            if self.config.neg > 0:
                                # classifiers = [(token, 1)] + [(target, 0) for target in self.table.sample(self.config.neg)]
                                im_classifiers = [(im_context_word, 1) for im_context_word in im_context] \
                                    + [(im_target, 0) for im_target in self.im_table.sample(self.config.neg)]
                                for im_word, im_label in im_classifiers:
                                    if im_label == 1:
                                        z_po = np.dot(self.im_pos_neu1, self.P[im_token])
                                        p_po = sigmoid(z_po)
                                        f_po = self.config.beta * (im_label - p_po)
                                        # negative gradient, with a pull of im_W toward ex_W
                                        g_po = f_po * self.P[im_token] - self.config.lambdaW * self.im_W[im_word] \
                                            - (self.im_W[im_word] - self.ex_W[im_word])
                                        self.im_W[im_word] += self.config.w2v_lr * g_po
                                    else:
                                        z_ne = np.dot(self.im_neg_neul, self.P[im_token])
                                        p_ne = sigmoid(z_ne)
                                        f_ne = self.config.beta * (im_label - p_ne)
                                        g_ne = f_ne * self.P[im_token] - self.config.lambdaW * self.im_W[im_word] \
                                            - (self.im_W[im_word] - self.ex_W[im_word])
                                        self.im_W[im_word] += self.config.w2v_lr * g_ne

                # word2vec-style update on the explicit network
                if str(user) not in self.ex_node_inverted_index:
                    self.ex_pos_loss_total += 0
                    self.ex_neg_loss_total += 0
                    self.config.alpha = 0  # bug fix: alpha weights the explicit terms in the P update (was beta)
                else:
                    self.config.alpha = self.config.temp1
                    ex_walks_list = self.ex_node_inverted_index[str(user)]
                    ex_wl = len(ex_walks_list)
                    if ex_wl > 0:
                        # np.random.seed(10)
                        rand_num = np.random.randint(low=0, high=len(ex_walks_list))
                        # print(rand_num)
                        ex_line_num = list(ex_walks_list)[rand_num]
                        ex_walks = self.ex_walks[ex_line_num]
                        ex_sent = self.ex_social_vocab.indices(['<bol>'] + ex_walks + ['<eol>'])
                        for ex_sent_pos, ex_token in enumerate(ex_sent):
                            if ex_token != u:
                                continue
                            ex_current_win = self.config.win_size  # np.random.randint(low=1, high=self.config.win_size + 1)
                            ex_context_start = max(ex_sent_pos - ex_current_win, 0)
                            ex_context_end = min(ex_sent_pos + ex_current_win + 1, len(ex_sent))
                            # take the context around the center word
                            ex_context = ex_sent[ex_context_start:ex_sent_pos] + ex_sent[ex_sent_pos + 1:ex_context_end]
                            # column-wise mean of the context vectors gives h of N+
                            self.ex_pos_neu1 = np.mean(np.array([self.ex_W[c] for c in ex_context]), axis=0)
                            self.ex_neg_neul = np.mean(np.array([
                                self.ex_W[target] for target in self.ex_table.sample(self.config.neg)]), axis=0)
                            self.ex_pos_loss_total += np.log(sigmoid(np.dot(self.P[ex_token], self.ex_pos_neu1)))
                            self.ex_neg_loss_total += np.log(sigmoid(-np.dot(self.P[ex_token], self.ex_neg_neul)))
                            if self.config.neg > 0:
                                ex_classifiers = [(ex_context_word, 1) for ex_context_word in ex_context] \
                                    + [(ex_target, 0) for ex_target in self.ex_table.sample(self.config.neg)]
                                for ex_word, ex_label in ex_classifiers:
                                    if ex_label == 1:
                                        z_po = np.dot(self.ex_pos_neu1, self.P[ex_token])
                                        p_po = sigmoid(z_po)
                                        f_po = self.config.alpha * (ex_label - p_po)
                                        # negative gradient, with a pull of ex_W toward im_W
                                        g_po = f_po * self.P[ex_token] - self.config.lambdaW * self.ex_W[ex_word] \
                                            + (self.im_W[ex_word] - self.ex_W[ex_word])
                                        self.ex_W[ex_word] += self.config.w2v_lr * g_po
                                    else:
                                        z_ne = np.dot(self.ex_neg_neul, self.P[ex_token])
                                        p_ne = sigmoid(z_ne)
                                        f_ne = self.config.alpha * (ex_label - p_ne)
                                        g_ne = f_ne * self.P[ex_token] - self.config.lambdaW * self.ex_W[ex_word] \
                                            + (self.im_W[ex_word] - self.ex_W[ex_word])
                                        self.ex_W[ex_word] += self.config.w2v_lr * g_ne

                # update the latent vectors P, Q, Bu, Bi in MF
                self.Bu[u] += self.config.lr * (error - self.config.lambdaB * bu)
                self.Bi[i] += self.config.lr * (error - self.config.lambdaB * bi)
                self.P[u] += self.config.lr * (
                    error * q
                    + self.config.alpha * (self.ex_pos_neu1 * (1 - sigmoid(np.dot(self.ex_pos_neu1, self.P[u])))
                                           - self.ex_neg_neul * sigmoid(np.dot(self.ex_neg_neul, self.P[u])))
                    + self.config.beta * (self.im_pos_neu1 * (1 - sigmoid(np.dot(self.im_pos_neu1, self.P[u])))
                                          - self.im_neg_neul * sigmoid(np.dot(self.im_neg_neul, self.P[u])))
                    - self.config.lambdaP * p)
                self.Q[i] += self.config.lr * (error * p - self.config.lambdaQ * q)

            self.loss += self.config.alpha * (-self.ex_pos_loss_total - self.ex_neg_loss_total) \
                + self.config.beta * (-self.im_pos_loss_total - self.im_neg_loss_total) \
                + 0.5 * self.config.lambdaP * (self.P * self.P).sum() \
                + 0.5 * self.config.lambdaQ * (self.Q * self.Q).sum() \
                + 0.5 * self.config.lambdaB * ((self.Bu * self.Bu).sum() + (self.Bi * self.Bi).sum()) \
                + 0.5 * self.config.lambdaW * ((self.ex_W * self.ex_W).sum() + (self.im_W * self.im_W).sum())
            # + 0.5 * np.linalg.norm(self.im_W - self.ex_W) ** 2
            iteration += 1
            if self.isConverged(iteration):
                break

    # An earlier predict() override, kept for reference:
    # def predict(self, u, i):
    #     if self.rg.containsUser(u) and self.rg.containsItem(i):
    #         u = self.rg.user[u]
    #         i = self.rg.item[i]
    #         return self.P[u].dot(self.Q[i]) + self.rg.globalMean + self.Bi[i] + self.Bu[u]
    #     elif self.rg.containsUser(u) and not self.rg.containsItem(i):
    #         return self.rg.userMeans[u]
    #     elif not self.rg.containsUser(u) and self.rg.containsItem(i):
    #         return self.rg.itemMeans[i]
    #     else:
    #         return self.rg.globalMean

    def generate_cu_net(self, rg):
        f = open(self.implict_trust_path, 'w')
        print('Building collaborative user network...')
        itemNet = {}
        for item in self.rg.trainSet_i:  # outer key: item; inner key: user; value: rating
            if len(self.rg.trainSet_i[item]) > 1:  # keep items rated by more than one user
                itemNet[item] = self.rg.trainSet_i[item]
        filteredRatings = defaultdict(list)
        for item in itemNet:
            for user in itemNet[item]:
                if itemNet[item][user] > 0:
                    # the user interacted with this item: record it
                    filteredRatings[user].append(item)
        self.CUNet = defaultdict(list)
        for user1 in filteredRatings:
            s1 = set(filteredRatings[user1])  # items rated by user1
            for user2 in filteredRatings:
                if user1 != user2:
                    s2 = set(filteredRatings[user2])  # items rated by user2
                    # edge weight = number of co-rated items between the two users
                    weight = len(s1.intersection(s2))
                    if weight > 0:
                        # self.CUNet[user1] += [user2]  # {user1: [user2, ...]}
                        f.writelines(str(user1) + ' ' + str(user2) + ' ' + str(weight) + '\n')
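For reference, the objective accumulated by train_model above can be written as follows. This is our reconstruction from the code, not a formula given by the source: $h^{+}$ is the mean context embedding, $h^{-}$ the mean negative-sample embedding, and $W_{ex}, W_{im}$ the explicit/implicit context matrices.

\[
\mathcal{L} = \sum_{(u,i)} \tfrac{1}{2}\,(r_{ui} - \hat r_{ui})^2
 - \alpha \sum_{u} \big[\log \sigma(P_u^{\top} h^{+}_{ex}) + \log \sigma(-P_u^{\top} h^{-}_{ex})\big]
 - \beta \sum_{u} \big[\log \sigma(P_u^{\top} h^{+}_{im}) + \log \sigma(-P_u^{\top} h^{-}_{im})\big]
 + \tfrac{\lambda_P}{2}\lVert P \rVert_F^2 + \tfrac{\lambda_Q}{2}\lVert Q \rVert_F^2
 + \tfrac{\lambda_B}{2}\big(\lVert B_u \rVert^2 + \lVert B_i \rVert^2\big)
 + \tfrac{\lambda_W}{2}\big(\lVert W_{ex} \rVert_F^2 + \lVert W_{im} \rVert_F^2\big)
\]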
class SimBase():
    def __init__(self):
        super(SimBase, self).__init__()
        self.config = ConfigX()
        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

    def check_dataset(self):
        super(SimBase, self).check_dataset()
        # if config.dataset_name != 'db':
        #     print("WARN: config.dataset_name is not set to 'db' - douban movie")
        #     # config.dataset_name = 'ml'
        #     # sys.exit()

    def build_user_item_sim(self, kfold, user_near_num=50, item_near_num=50, load_save_sim=False):
        """
        Build the user and item similarity matrices.
        load_save_sim: load previously saved data to speed up testing.
        """
        # Currently only one similarity (SimCF) is used.
        # TODO: blend multiple similarity measures.
        self.build_user_item_sim_CF(kfold, user_near_num=user_near_num,
                                    item_near_num=item_near_num,
                                    load_save_sim=load_save_sim)

    def build_user_item_sim_CF(self, kfold, user_near_num=50, item_near_num=50, load_save_sim=False):
        self.rg = RatingGetter(kfold)
        self.mg = MetaGetter(kfold)
        from collections import defaultdict

        # compute item-item similarity matrix
        print('Building item-item similarity matrix ...')
        if load_save_sim:
            self.item_sim = util.load_data(
                '../data/sim/%s_08_ii_cucmemf_cv0.pkl' % self.config.dataset_name)
        else:
            # item similarity computation is wrapped in MetaGetter
            self.item_sim = self.mg.getSimMatrix(jaccard_sim)
            util.save_data(
                self.item_sim,
                '../data/sim/%s_08_ii_cucmemf_cv0.pkl' % self.config.dataset_name)

        # compute the k neighbors of each item
        if load_save_sim:
            self.item_k_neibor = util.load_data(
                '../data/neibor/%s_08_ii_%s_neibor_cucmemf_cv0.pkl' % (self.config.dataset_name, item_near_num))
        else:
            for item in self.mg.item:
                matchItems = sorted(self.item_sim[item].items(), key=lambda x: x[1], reverse=True)[:item_near_num]
                self.item_k_neibor[item] = dict(matchItems)
            util.save_data(
                self.item_k_neibor,
                '../data/neibor/%s_08_ii_%s_neibor_cucmemf_cv0.pkl' % (self.config.dataset_name, item_near_num))

        # compute user-user similarity matrix
        print('Building user-user similarity matrix ...')
        if load_save_sim:
            self.user_sim = util.load_data(
                '../data/sim/%s_08_uu_cucmemf_cv0.pkl' % self.config.dataset_name)
        else:
            for u1 in tqdm(self.rg.user):
                for u2 in self.rg.user:
                    if u1 != u2:
                        if self.user_sim.contains(u1, u2):
                            continue
                        # Pearson similarity; consider switching to cosine?
                        sim = pearson_sp(self.rg.get_row(u1), self.rg.get_row(u2))
                        sim = round(sim, 5)
                        self.user_sim.set(u1, u2, sim)
            if not os.path.exists('../data/sim'):
                os.makedirs('../data/sim')
                print('../data/sim folder has been established.')
            util.save_data(
                self.user_sim,
                '../data/sim/%s_08_uu_cucmemf_cv0.pkl' % self.config.dataset_name)

        # compute the k neighbors of each user
        if load_save_sim:
            self.user_k_neibor = util.load_data(
                '../data/neibor/%s_08_uu_%s_neibor_cucmemf_cv0.pkl' % (self.config.dataset_name, user_near_num))
        else:
            for user in self.rg.user:
                # NOTE: starting the slice at `kfold` looks unintended; `[:user_near_num]` is the likely intent
                matchUsers = sorted(self.user_sim[user].items(), key=lambda x: x[1], reverse=True)[kfold:user_near_num]
                self.user_k_neibor[user] = dict(matchUsers)
            if not os.path.exists('../data/neibor'):
                os.makedirs('../data/neibor')
                print('../data/neibor folder has been established.')
            util.save_data(
                self.user_k_neibor,
                '../data/neibor/%s_08_uu_%s_neibor_cucmemf_cv0.pkl' % (self.config.dataset_name, user_near_num))
class MF(object):
    """
    The base (parent) class for matrix-factorization-based models.
    """

    def __init__(self, fixseed=True):
        super(MF, self).__init__()
        self.config = ConfigX()
        self.configc = ConfigCUC()
        cpprint(self.config.__dict__)  # print the configuration
        # print data statistics
        print_data_file_stats(self.config.rating_path)
        print_data_file_stats(self.config.trust_path)
        if fixseed:
            np.random.seed(seed=self.config.random_state)  # fix the random seed
        # self.rg = RatingGetter()  # loading rating data
        # self.init_model()
        self.iter_rmse = []
        self.iter_mae = []

    def init_model(self, k):
        self.read_data(k)
        # print("mf.py TODO: print the real number of ratings = ", )
        print("[%s] %s fold_round = %sth , user * item (m * n) = %s * %s" % (
            self.config.dataset_name,
            self.__class__,
            k,  # fold number
            self.rg.get_train_size()[0],
            self.rg.get_train_size()[1]))
        print("[%s] %s self.config.factor = %s lambdaP = %s, lambdaQ = %s " % (
            self.config.dataset_name,
            self.__class__,
            self.config.factor,
            self.config.lambdaP,
            self.config.lambdaQ))
        # random init, size m * d
        np.random.seed(seed=self.config.random_state)  # fix the random seed
        self.P = np.random.rand(self.rg.get_train_size()[0], self.config.factor) / (
            self.config.factor ** 0.5)  # latent user matrix
        # random init, size n * d
        np.random.seed(seed=self.config.random_state)  # fix the random seed
        self.Q = np.random.rand(self.rg.get_train_size()[1], self.config.factor) / (
            self.config.factor ** 0.5)  # latent item matrix
        self.loss, self.lastLoss = 0.0, 0.0
        self.lastRmse, self.lastMae = 10.0, 10.0

    def read_data(self, k):
        print("[%s] %s starting ... fold_round = %sth" % (
            self.config.dataset_name,
            self.__class__,
            k))  # fold number
        # statistics for the cross-validation data
        cv_data_path = self.config.rating_cv_path + self.config.dataset_name + "-" + str(k) + ".csv"
        print_data_file_stats(cv_data_path)
        # initialize the rating getter
        self.rg = RatingGetter(k)

    def train_model(self, k):
        self.init_model(k)

    # test all users in the test set
    def predict_model(self):
        res = []
        for ind, entry in enumerate(self.rg.testSet()):
            user, item, rating = entry
            rating_length = len(self.rg.trainSet_u[user])
            # skip cold-start users during testing
            if rating_length <= self.config.coldUserRating:
                continue
            prediction = self.predict(user, item)
            # denormalize
            prediction = denormalize(prediction, self.config.min_val, self.config.max_val)
            pred = self.checkRatingBoundary(prediction)
            # collect predictions for the metrics
            res.append([user, item, rating, pred])
        rmse = Metric.RMSE(res)
        mae = Metric.MAE(res)
        self.iter_rmse.append(rmse)  # for plotting
        self.iter_mae.append(mae)
        return rmse, mae

    # test cold-start users in the test set
    def predict_model_cold_users(self):
        res = []
        for user in self.rg.testColdUserSet_u.keys():
            for item in self.rg.testColdUserSet_u[user].keys():
                rating = self.rg.testColdUserSet_u[user][item]
                pred = self.predict(user, item)
                # pred = sigmoid(pred)
                # denormalize
                pred = denormalize(pred, self.config.min_val, self.config.max_val)
                pred = self.checkRatingBoundary(pred)
                res.append([user, item, rating, pred])
        rmse = Metric.RMSE(res)
        mae = Metric.MAE(res)
        return rmse, mae

    def predict(self, u, i):
        if self.rg.containsUser(u) and self.rg.containsItem(i):
            return self.P[self.rg.user[u]].dot(self.Q[self.rg.item[i]])
        elif self.rg.containsUser(u) and not self.rg.containsItem(i):
            return self.rg.userMeans[u]
        elif not self.rg.containsUser(u) and self.rg.containsItem(i):
            return self.rg.itemMeans[i]
        else:
            return self.rg.globalMean

    def checkRatingBoundary(self, prediction):
        return round(min(max(prediction, self.config.min_val), self.config.max_val), 3)

    def isConverged(self, iter):
        from math import isnan
        if isnan(self.loss):
            print('Loss = NaN or Infinity: the current settings do not fit the recommender! Change the settings and try again!')
            exit(-1)
        deltaLoss = (self.lastLoss - self.loss)
        rmse, mae = self.predict_model()
        # early stopping
        if self.config.isEarlyStopping:
            cond = self.lastRmse < rmse
            if cond:
                print('test rmse increased, so early stopping')
                return cond
            self.lastRmse = rmse
            self.lastMae = mae
        print('[%s] %s iteration %d: loss = %.4f, delta_loss = %.5f learning_Rate = %.5f rmse=%.5f mae=%.5f' %
              (self.config.dataset_name, self.__class__, iter, self.loss, deltaLoss, self.config.lr, rmse, mae))
        # check convergence
        converged = abs(deltaLoss) < self.config.threshold
        # if not converged:
        #     self.updateLearningRate(iter)
        self.lastLoss = self.loss
        # shuffle(self.dao.trainingData)
        return converged

    def updateLearningRate(self, iter):
        if iter > 1:
            if abs(self.lastLoss) > abs(self.loss):
                self.config.lr *= 1.05
            else:
                self.config.lr *= 0.5
        if self.config.lr > 1:
            self.config.lr = 1

    def show_rmse(self):
        '''
        Plot RMSE and MAE against epochs.
        '''
        nums = range(len(self.iter_rmse))
        plt.plot(nums, self.iter_rmse, label='RMSE')
        plt.plot(nums, self.iter_mae, label='MAE')
        plt.xlabel('# of epoch')
        plt.ylabel('metric')
        plt.title(self.__class__)
        plt.legend()
        plt.show()

    def show_loss(self, loss_all, faloss_all):
        '''
        Plot loss against epochs.
        '''
        nums = range(len(loss_all))
        plt.plot(nums, loss_all, label='front')
        plt.plot(nums, faloss_all, label='rear')
        plt.xlabel('# of epoch')
        plt.ylabel('loss')
        plt.title('loss experiment')
        plt.legend()
        plt.show()
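A minimal usage sketch of the fold-based API above (our assumption, not from the source): a concrete subclass overriding train_model would be driven like this, and ConfigX is assumed to expose k_fold_num.

if __name__ == '__main__':
    model = MF()  # in practice, a subclass that implements train_model
    rmses, maes = [], []
    for k in range(model.config.k_fold_num):
        model.train_model(k)            # trains on fold k via init_model/read_data
        rmse, mae = model.predict_model()
        rmses.append(rmse)
        maes.append(mae)
    print('avg rmse = %.5f, avg mae = %.5f' % (sum(rmses) / len(rmses), sum(maes) / len(maes)))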
def read_data(self, k):
    self.rg = RatingGetter(k)
def build_user_item_sim_CF(self, kfold, user_near_num=50, item_near_num=50, load_save_sim=False):
    self.rg = RatingGetter(kfold)
    from collections import defaultdict

    # compute user-user similarity matrix
    print('Building user-user similarity matrix ...')
    if load_save_sim:
        self.user_sim = util.load_data('../data/sim/db_08_uu_tricf_cv0.pkl')
    else:
        for u1 in self.rg.user:
            for u2 in self.rg.user:
                if u1 != u2:
                    if self.user_sim.contains(u1, u2):
                        continue
                    # Pearson similarity; consider switching to cosine
                    sim = pearson_sp(self.rg.get_row(u1), self.rg.get_row(u2))
                    sim = round(sim, 5)
                    self.user_sim.set(u1, u2, sim)
        if not os.path.exists('../data/sim'):
            os.makedirs('../data/sim')
            print('../data/sim folder has been established.')
        util.save_data(self.user_sim, '../data/sim/db_08_uu_tricf_cv0.pkl')

    # compute the k neighbors of each user
    # self.user_k_neibor = util.load_data(
    #     '../data/neibor/db_08_uu_' + str(user_near_num) + '_neibor_tricf.pkl')
    for user in self.rg.user:
        # NOTE: starting the slice at `kfold` looks unintended; `[:user_near_num]` is the likely intent
        matchUsers = sorted(self.user_sim[user].items(), key=lambda x: x[1], reverse=True)[kfold:user_near_num]
        self.user_k_neibor[user] = dict(matchUsers)
    if not os.path.exists('../data/neibor'):
        os.makedirs('../data/neibor')
        print('../data/neibor folder has been established.')
    util.save_data(
        self.user_k_neibor,
        '../data/neibor/db_08_uu_' + str(user_near_num) + '_neibor_tricf_cv0.pkl')

    # compute item-item similarity matrix
    print('Building item-item similarity matrix ...')
    if load_save_sim:
        self.item_sim = util.load_data('../data/sim/db_08_ii_tricf_cv0.pkl')
    else:
        for i1 in self.rg.item:
            for i2 in self.rg.item:
                if i1 != i2:
                    if self.item_sim.contains(i1, i2):
                        continue
                    # Pearson similarity; consider switching to cosine
                    sim = pearson_sp(self.rg.get_col(i1), self.rg.get_col(i2))
                    sim = round(sim, 5)
                    self.item_sim.set(i1, i2, sim)
        util.save_data(self.item_sim, '../data/sim/db_08_ii_tricf_cv0.pkl')

    # compute the k neighbors of each item
    # self.item_k_neibor = util.load_data(
    #     '../data/neibor/db_08_ii_' + str(item_near_num) + '_neibor_tricf.pkl')
    for item in self.rg.item:
        matchItems = sorted(self.item_sim[item].items(), key=lambda x: x[1], reverse=True)[:item_near_num]
        self.item_k_neibor[item] = dict(matchItems)
    util.save_data(
        self.item_k_neibor,
        '../data/neibor/db_08_ii_' + str(item_near_num) + '_neibor_tricf_cv0.pkl')
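`pearson_sp` is the repo's own utility and is not shown here. A hedged sketch of what it likely computes: Pearson correlation over co-rated items with significance weighting. `get_row`/`get_col` are assumed to return a {key: rating} dict, and the shrinkage constant is our guess.

import numpy as np

def pearson_sp(row1, row2, shrinkage=10):
    # Pearson correlation restricted to co-rated keys, shrunk toward 0 when
    # the overlap is small (significance weighting)
    common = set(row1.keys()) & set(row2.keys())
    n = len(common)
    if n < 2:
        return 0.0
    x = np.array([row1[k] for k in common], dtype=float)
    y = np.array([row2[k] for k in common], dtype=float)
    x, y = x - x.mean(), y - y.mean()
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    if denom == 0:
        return 0.0
    return float(np.dot(x, y) / denom) * (n / (n + shrinkage))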
class MF(object):
    """
    The base (parent) class for matrix-factorization-based models.
    """

    def __init__(self):
        super(MF, self).__init__()
        self.config = ConfigX()
        self.rg = RatingGetter()  # loading rating data
        # self.init_model()
        self.iter_rmse = []
        self.iter_mae = []

    def init_model(self):
        self.P = np.random.rand(self.rg.get_train_size()[0], self.config.factor) / (
            self.config.factor ** 0.5)  # latent user matrix
        self.Q = np.random.rand(self.rg.get_train_size()[1], self.config.factor) / (
            self.config.factor ** 0.5)  # latent item matrix
        self.Bu = np.random.rand(self.rg.get_train_size()[0])  # bias value of user
        self.Bi = np.random.rand(self.rg.get_train_size()[1])  # bias value of item
        print(self.rg.get_train_size()[0])
        print(self.rg.get_train_size()[1])
        self.loss, self.lastLoss = 0.0, 0.0
        self.lastRmse, self.lastMae = 10.0, 10.0

    def train_model(self):
        pass

    def valid_model(self):
        res = []
        for ind, entry in enumerate(self.rg.validSet()):
            user, item, rating = entry
            # predict
            prediction = self.predict(user, item)
            # denormalize
            prediction = denormalize(prediction, self.config.min_val, self.config.max_val)
            pred = self.checkRatingBoundary(prediction)
            # collect predictions for the metrics
            # self.dao.testData[ind].append(pred)
            res.append([user, item, rating, pred])
        rmse = Metric.RMSE(res)
        mae = Metric.MAE(res)
        self.iter_rmse.append(rmse)  # for plotting
        self.iter_mae.append(mae)
        return rmse, mae

    # test all users in the test set
    def predict_model(self):
        res = []
        for ind, entry in enumerate(self.rg.testSet()):
            user, item, rating = entry
            # predict
            prediction = self.predict(user, item)
            # denormalize
            prediction = denormalize(prediction, self.config.min_val, self.config.max_val)
            pred = self.checkRatingBoundary(prediction)
            # collect predictions for the metrics
            # self.dao.testData[ind].append(pred)
            res.append([user, item, rating, pred])
        rmse = Metric.RMSE(res)
        mae = Metric.MAE(res)
        print('learning_Rate = %.5f rmse=%.5f mae=%.5f' % (self.config.lr, rmse, mae))
        # self.iter_rmse.append(rmse)  # for plotting
        # self.iter_mae.append(mae)
        return rmse, mae

    # test cold-start users in the test set
    def predict_model_cold_users(self):
        res = []
        for user in self.rg.testColdUserSet_u.keys():
            for item in self.rg.testColdUserSet_u[user].keys():
                rating = self.rg.testColdUserSet_u[user][item]
                pred = self.predict(user, item)
                # denormalize
                pred = denormalize(pred, self.config.min_val, self.config.max_val)
                pred = self.checkRatingBoundary(pred)
                res.append([user, item, rating, pred])
        rmse = Metric.RMSE(res)
        return rmse

    def predict(self, u, i):
        if self.rg.containsUser(u) and self.rg.containsItem(i):
            u = self.rg.user[u]
            i = self.rg.item[i]
            return self.P[u].dot(self.Q[i]) + self.rg.globalMean + self.Bi[i] + self.Bu[u]
        elif self.rg.containsUser(u) and not self.rg.containsItem(i):
            return self.rg.userMeans[u]
        elif not self.rg.containsUser(u) and self.rg.containsItem(i):
            return self.rg.itemMeans[i]
        else:
            return self.rg.globalMean

    def checkRatingBoundary(self, prediction):
        if prediction > self.config.max_val:
            return self.config.max_val
        elif prediction < self.config.min_val:
            return self.config.min_val
        else:
            return round(prediction, 3)

    def save_P(self, file_name):
        P_dic = defaultdict(list)
        with open(file_name, 'wb') as f:
            print(self.rg.get_train_size()[0])
            for i in range(self.rg.get_train_size()[0]):
                if i in self.rg.id2user.keys():
                    user = self.rg.id2user[i]
                    P_dic[user] = self.P[i]
            pickle.dump(P_dic, f)

    def save_Q(self, file_name):
        Q_dic = defaultdict(list)
        with open(file_name, 'wb') as f:
            print(self.rg.get_train_size()[1])
            for i in range(self.rg.get_train_size()[1]):
                if i in self.rg.id2item.keys():
                    item = self.rg.id2item[i]  # bug fix: the variable was misleadingly named `user`
                    Q_dic[item] = self.Q[i]
            pickle.dump(Q_dic, f)

    def isConverged(self, iter):
        from math import isnan
        if isnan(self.loss):
            print('Loss = NaN or Infinity: the current settings do not fit the recommender! Change the settings and try again!')
            exit(-1)
        # measure = self.performance()
        # value = [item.strip() for item in measure]
        # with open(self.algorName + ' iteration.txt')
        deltaLoss = (self.lastLoss - self.loss)
        rmse, mae = self.valid_model()
        # early stopping
        if self.config.isEarlyStopping:
            cond = self.lastRmse < rmse
            if cond:
                print('validation rmse increased, so early stopping')
                # P_path = '../data/P/P_path_100_feas.pkl'
                # self.save_P(P_path)
                # Q_path = '../data/Q/Q_300_feas.pkl'
                # self.save_Q(Q_path)
                return cond
            self.lastRmse = rmse
            self.lastMae = mae
        print('%s iteration %d: loss = %.4f, delta_loss = %.5f learning_Rate = %.5f rmse=%.5f mae=%.5f' %
              (self.__class__, iter, self.loss, deltaLoss, self.config.lr, rmse, mae))
        # check convergence
        converged = abs(deltaLoss) < self.config.threshold
        # if not converged:
        #     self.updateLearningRate(iter)
        self.lastLoss = self.loss
        # shuffle(self.dao.trainingData)
        return converged

    def updateLearningRate(self, iter):
        if iter > 1:
            if abs(self.lastLoss) > abs(self.loss):
                self.config.lr *= 1.05
            else:
                self.config.lr *= 0.5
        if self.config.lr > 1:
            self.config.lr = 1

    def show_rmse(self):
        '''
        Plot RMSE and MAE against epochs.
        '''
        nums = range(len(self.iter_rmse))
        plt.plot(nums, self.iter_rmse, label='RMSE')
        plt.plot(nums, self.iter_mae, label='MAE')
        plt.xlabel('# of epoch')
        plt.ylabel('metric')
        plt.title(self.__class__)
        plt.legend()
        plt.show()
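train_model is deliberately left to subclasses here. A minimal SGD sketch (ours, mirroring the update rules GEMF uses above; not the authors' implementation) of a subclass fitting the biased predictor defined in this class:

class BiasedMF(MF):
    def train_model(self):
        self.init_model()
        iteration = 0
        while iteration < self.config.maxIter:
            self.loss = 0
            for user, item, rating in self.rg.trainSet():
                u, i = self.rg.user[user], self.rg.item[item]
                error = rating - self.predict(user, item)
                self.loss += 0.5 * error ** 2
                p, q = self.P[u], self.Q[i]
                # gradient steps for biases and latent factors
                self.Bu[u] += self.config.lr * (error - self.config.lambdaB * self.Bu[u])
                self.Bi[i] += self.config.lr * (error - self.config.lambdaB * self.Bi[i])
                self.P[u] += self.config.lr * (error * q - self.config.lambdaP * p)
                self.Q[i] += self.config.lr * (error * p - self.config.lambdaQ * q)
            self.loss += 0.5 * self.config.lambdaP * (self.P * self.P).sum() \
                + 0.5 * self.config.lambdaQ * (self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break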