def __init__(self, train_filename, topic_num, split_sig, time_format, uin, iin, timein, read_from_file=False):
    """Train or load a time-aware PLSA (tim-plsa) model.

    Reads the check-in table, then either loads previously cached
    parameter files from ``mid_data/<train>/tim-plsa<topic_num>t/`` or
    runs EM training (``init_data`` + ``em_loop``) and caches the
    learned distributions.

    :param train_filename: path of the training check-in file
    :param topic_num: number of latent topics
    :param split_sig: field separator of the input file
    :param time_format: strptime format of the timestamp column
    :param uin: column index of the user id
    :param iin: column index of the item id
    :param timein: column index of the timestamp
    :param read_from_file: unused; kept for caller compatibility
    """
    self.train_filename = train_filename
    self.topic_num = topic_num
    self.checks = read_checks_table(train_filename, split_sig=split_sig,
                                    time_format=time_format, uin=uin,
                                    iin=iin, timein=timein)
    self.check_set = read_dic_set(train_filename, split_tag=split_sig,
                                  oin=uin, ain=iin)
    # Dict keys are unique already, so len() of the dict is the user
    # count (the original built a throwaway set of the keys first).
    print(len(self.check_set))

    dir_name = ('mid_data/' + '-'.join(train_filename.split('.')[:-1])
                + '/tim-plsa' + str(topic_num) + 't/')
    u_in_z_filename = dir_name + 'pr_u_in_z.txt'
    i_in_z_filename = dir_name + 'pr_i_in_z.txt'
    z_filename = dir_name + 'pz.txt'
    pr_filename = dir_name + 'pr.txt'
    t_in_z_filename = dir_name + 'pr_t_in_z.txt'
    if not os.path.exists(dir_name):
        # parent 'mid_data/<train>/' is created by the caller (cf_main)
        os.mkdir(dir_name)

    if (os.path.exists(u_in_z_filename) and os.path.exists(i_in_z_filename)
            and os.path.exists(z_filename) and os.path.exists(pr_filename)
            and os.path.exists(t_in_z_filename)):
        # NOTE(review): self.pr is never loaded in this branch even
        # though pr_filename existence is checked — confirm downstream
        # code does not need self.pr after a cache hit.
        self.pr_u_in_z = read_obj(u_in_z_filename)
        self.pr_i_in_z = read_obj(i_in_z_filename)
        self.pz = read_obj(z_filename)
        self.pr_t_in_z = read_obj(t_in_z_filename)
    else:
        self.pr_u_in_z, self.pr_i_in_z, self.pr_t_in_z, self.pz, self.pr = self.init_data()
        self.em_loop()
        # Persist the trained parameters so the next run is a cache hit.
        write_obj(u_in_z_filename, self.pr_u_in_z)
        write_obj(i_in_z_filename, self.pr_i_in_z)
        write_obj(z_filename, self.pz)
        write_obj(pr_filename, self.pr)
        write_obj(t_in_z_filename, self.pr_t_in_z)
    print(self.pz)
def __init__(self, train_filename, topic_num, split_sig, time_format, uin, iin, timein, read_from_file=False):
    """Train or load a plain PLSA model.

    Loads cached parameters from ``mid_data/<train>/plsa<topic_num>t/``
    when all files exist; otherwise runs EM via the MyLDA static
    helpers, derives the per-user topic distribution P(z|u), and caches
    everything.

    :param train_filename: path of the training check-in file
    :param topic_num: number of latent topics
    :param split_sig: field separator of the input file
    :param time_format: strptime format of the timestamp column
    :param uin: column index of the user id
    :param iin: column index of the item id
    :param timein: column index of the timestamp
    :param read_from_file: unused; kept for caller compatibility
    """
    self.train_filename = train_filename
    self.topic_num = topic_num
    self.checks = read_checks_table(train_filename, split_sig=split_sig,
                                    time_format=time_format, uin=uin,
                                    iin=iin, timein=timein)

    dir_name = ('mid_data/' + '-'.join(train_filename.split('.')[:-1])
                + '/plsa' + str(topic_num) + 't/')
    u_in_z_filename = dir_name + 'pr_u_in_z.txt'
    i_in_z_filename = dir_name + 'pr_i_in_z.txt'
    z_filename = dir_name + 'pz.txt'
    pr_filename = dir_name + 'pr.txt'
    z_in_u_filename = dir_name + 'pr_z_in_u.txt'
    # (removed: unused locals user_list_file / items_list_file)
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

    if (os.path.exists(u_in_z_filename) and os.path.exists(i_in_z_filename)
            and os.path.exists(z_filename) and os.path.exists(pr_filename)
            and os.path.exists(z_in_u_filename)):
        self.pr_u_in_z = read_obj(u_in_z_filename)
        self.pr_i_in_z = read_obj(i_in_z_filename)
        self.pz = read_obj(z_filename)
        self.pr_z_in_u = read_obj(z_in_u_filename)
        # self.pr = read_obj(pr_filename)
    else:
        self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr = MyLDA.init_data(
            topic_num, self.checks)
        MyLDA.em_loop(self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr,
                      self.checks)
        # Aggregate P(z | u, i) over each user's check-ins to obtain the
        # per-user topic distribution P(z|u), then normalise it to 1.
        self.pr_z_in_u = {}
        for u in self.checks.keys():
            self.pr_z_in_u[u] = {}
            for z in self.pz.keys():
                self.pr_z_in_u[u][z] = 0
                for check in self.checks[u]:
                    i = check[0]
                    self.pr_z_in_u[u][z] += self.pr[(u, i)][z]
            dic_value_reg_one(self.pr_z_in_u[u])
        write_obj(u_in_z_filename, self.pr_u_in_z)
        write_obj(i_in_z_filename, self.pr_i_in_z)
        write_obj(z_filename, self.pz)
        write_obj(pr_filename, self.pr)
        write_obj(z_in_u_filename, self.pr_z_in_u)
    print(self.pz)
def __init__(self, checks, K, dirname):
    """Load cached factor matrices p/q from *dirname*, or train them
    with ``MyMFModel.gradAscent`` and cache the result.

    :param checks: user -> check-in list training data
    :param K: number of latent factors
    :param dirname: cache directory (must end with a separator)
    """
    self.checks = checks
    self.K = K
    path_p = dirname + 'p.txt'
    path_q = dirname + 'q.txt'
    cached = os.path.exists(path_p) and os.path.exists(path_q)
    if cached:
        self.p = read_obj(path_p)
        self.q = read_obj(path_q)
    else:
        self.p, self.q = MyMFModel.gradAscent(self.checks, K)
        write_obj(path_p, self.p)
        write_obj(path_q, self.q)
def __init__(self, h):
    """Remember the parameter *h* and warm the on-disk cache.

    ``self.name`` is expected to be supplied by the concrete class
    (e.g. a property deriving the cache filename from *h*).
    """
    self.h = h
    # Resume from a previous run when the cache file exists.
    self.cache = read_obj(self.name) if os.path.exists(self.name) else {}
    self.size = len(self.cache)
def __init__(self, dir_name):
    """Remember the cache directory and warm the on-disk cache.

    ``self.name`` is expected to be supplied by the concrete class
    (e.g. a property deriving the cache filename from *dir_name*).
    """
    self.dir_name = dir_name
    # Resume from a previous run when the cache file exists.
    self.cache = read_obj(self.name) if os.path.exists(self.name) else {}
    self.size = len(self.cache)
def main(train_file, test_file, feature_num, topks):
    """Evaluate a PMF recommender.

    Trains (or reloads) a matrix-factorisation model, produces (or
    reloads) an exclusion-filtered recommendation list, and reports
    precision/recall at every cutoff in *topks*.

    :returns: (nprs, nres) — lists of per-run precision / recall rows
    """
    root = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'
    if not os.path.exists(root):
        os.makedirs(root)
    nprs = []
    nres = []
    print('read_table')
    checks = read_checks_table(train_file, uin=0, iin=1)
    test = read_checks_table(test_file, uin=0, iin=1)
    sim_fun_name = 'pmf' + str(feature_num) + 't/'
    dir_name = root + sim_fun_name
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    mf_model = MyMFModel(checks, K=feature_num, dirname=dir_name)
    ex_rec_name = dir_name + 'ex_rec' + '.txt'
    if os.path.exists(ex_rec_name):
        print('read recommend result from file')
        rec = read_obj(ex_rec_name)
        # NOTE(review): cached lists are flipped on reload — presumably
        # they were stored in ascending score order; confirm.
        for ranked in rec.values():
            ranked.reverse()
    else:
        print('recommend')
        rec = mf_model.recommend()
        write_obj(ex_rec_name, rec)
    prs = []
    res = []
    for topk in topks:
        pr = precision(rec, test, topk)
        print(pr)
        re = recall(rec, test, topk)
        prs.append(float('%.4f' % pr))
        res.append(float('%.4f' % re))
    nprs.append(prs.copy())
    nres.append(res.copy())
    out_json_to_file(dir_name + 'pr.txt', nprs)
    out_json_to_file(dir_name + 're.txt', nres)
    return nprs, nres
def cf_main(train_file, test_file, topns=None, topks=None, topic_num=8):
    """Evaluate the tim-plsa recommender end to end.

    Reads train/test check-in tables, trains (or reloads) a MyLDA
    tim-plsa model, builds a candidate item pool from the top items of
    every topic, produces (or reloads) recommendations, and reports
    precision/recall at each cutoff in *topks*.

    :param train_file: training check-in file (tab separated)
    :param test_file: test check-in file (tab separated)
    :param topns: neighbourhood sizes to iterate over (default [20])
    :param topks: ranking cutoffs for precision/recall (default [20])
    :param topic_num: number of latent topics for the tim-plsa model
    :returns: (nprs, nres) — per-topn precision / recall rows
    """
    start = datetime.now()
    if topks is None:
        topks = [20]
    if topns is None:
        topns = [20]
    nprs = []
    nres = []
    print('read_table')
    # Alternative dataset formats kept for reference:
    # table = read_checks_table(train_file, split_sig='\t', uin=0, iin=4, timein=1, scorein=None,
    #                           time_format='%Y-%m-%dT%H:%M:%SZ')
    # test = read_checks_table(test_file, split_sig='\t', uin=0, iin=4, timein=1, scorein=None,
    #                          time_format='%Y-%m-%dT%H:%M:%SZ')
    table = read_checks_table(train_file, split_sig='\t', uin=0, iin=1,
                              timein=7, scorein=None,
                              time_format='%a %b %d %H:%M:%S %z %Y')
    test = read_checks_table(test_file, split_sig='\t', uin=0, iin=1,
                             timein=7, scorein=None,
                             time_format="%a %b %d %H:%M:%S %z %Y")
    # table = read_checks_table(train_file, split_sig=',', uin=0, iin=4, timein=3, scorein=None,
    #                           time_format='%Y-%m-%d %H:%M:%S')
    # test = read_checks_table(test_file, split_sig=',', uin=0, iin=4, timein=3, scorein=None,
    #                          time_format='%Y-%m-%d %H:%M:%S')
    # friends_dic = read_dic_set('Gowalla_edges.txt')
    if not os.path.exists('mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'):
        os.mkdir('mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/')
    # ========= LDA ================
    # lda = MyLDA(train_filename=train_file, topic_num=topic_num, split_sig='\t', uin=0, iin=4, timein=1,
    #             time_format='%Y-%m-%dT%H:%M:%SZ')
    # lda = MyLDA(train_filename=train_file, topic_num=topic_num, split_sig=',', uin=0, iin=4, timein=3, time_format='%Y-%m-%d %H:%M:%S')
    lda = MyLDA(train_filename=train_file, topic_num=topic_num,
                split_sig='\t', uin=0, iin=1, timein=7,
                time_format="%a %b %d %H:%M:%S %z %Y")
    # sim_fun = lambda u1, u2: lda.sim(u1, u2)
    predict_fun = lda.predict
    # '''
    sim_fun_name = 'tim-plsa' + str(topic_num) + 't'
    dir_name = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/' + sim_fun_name + '/'
    sim_name = dir_name + 'sim.txt'
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    # Similarity-matrix caching is disabled in this variant:
    # if os.path.exists(sim_name):
    #     print('read sim metrics from file')
    #     sim_metrics = read_obj(sim_name)
    # else:
    #     print('cal_sim_mat')
    #     sim_metrics = cal_sim_mat(table, similar_fun=sim_fun)
    #     write_obj(sim_name, sim_metrics)
    for topn in topns:
        ex_rec_name = dir_name + '-'.join(['ex_rec', sim_fun_name, str(topn)]) + '.txt'
        if os.path.exists(ex_rec_name):
            print('read recommend result from file')
            rec = read_obj(ex_rec_name)
        else:
            print('recommend')
            users = set(table.keys())
            items = set()
            # Candidate pool: the globally top-scoring items of P(i|z)
            # across all topics, kept by a bounded priority queue.
            # NOTE(review): pr_i_in_z is indexed as a 2-D matrix here
            # (pr_i_in_z[z, i]) — presumably a numpy array; confirm.
            zp = ZPriorityQ(maxsize=1000)
            for z in range(len(lda.pr_i_in_z)):
                for i in range(len(lda.pr_i_in_z[z, :])):
                    zp.enQ(KVTtem(i, lda.pr_i_in_z[z, i]))
            items.update([e.k for e in zp.items])
            print(len(items))
            # for item, v in lda.pr_i_in_z[0].items():
            #     items.add(item)
            # Recommend only items each user has not already visited.
            rec = exclude_recommend(table, users, items, predict_fun)
            # write_obj(rec_name, rec)
            # exclude_dup(table, rec)
            write_obj(ex_rec_name, rec)
        prs = []
        res = []
        for topk in topks:
            print('precision')
            pr = precision(rec, test, topk)
            print(pr)
            re = recall(rec, test, topk)
            print('recall')
            prs.append(float('%.4f' % pr))
            res.append(float('%.4f' % re))
        # print('y1=',prs)
        # print('y2=',res)
        nprs.append(prs.copy())
        nres.append(res.copy())
        # Results are rewritten after every topn so partial runs persist.
        out_json_to_file(dir_name + 'nprs.txt', nprs)
        out_json_to_file(dir_name + 'nres.txt', nres)
    end = datetime.now()
    print('the cost time is ', (end - start).seconds)
    return nprs, nres
from pprint import pprint
from numpy import array, arange
import matplotlib.pyplot as plt
from rec_lib.utils import read_obj

# Plot, for every latent topic, the 24-hour activity profile P(t|z)
# learned by the 7-topic tim-plsa model on the TSMC2014 NYC dataset.
t_in_z = read_obj(
    'mid_data/trainid-id-dataset_TSMC2014_NYC/tim-plsa7t/pr_t_in_z.txt')
hours = arange(24)
for topic_profile in t_in_z:
    plt.plot(hours, topic_profile)
plt.show()
def __init__(self, checks, K, dirname):
    """Build (or reload) user/item indexes and the rating matrix, then
    train (or reload) the latent factor matrices p and q.

    :param checks: dict mapping original user id -> list of check-ins;
        each check-in's first element is the item id
    :param K: number of latent factors
    :param dirname: cache directory for p/q and the id lists
    """
    self.K = K
    p_name = dirname + 'p.txt'
    q_name = dirname + 'q.txt'
    user_list_name = dirname + 'user-list.txt'
    item_list_name = dirname + 'item-list.txt'

    def build_index(ids):
        # original id -> dense matrix row/column index
        return {ids[pos]: pos for pos in range(len(ids))}

    def build_r(accumulate):
        # Dense-indexed rating dict: R[u][i] is a visit count when
        # *accumulate* is true, otherwise a 0/1 indicator.
        r = {}
        for ou in checks.keys():
            u = self.user_index[ou]
            if u not in r:
                r[u] = {}
            # BUG FIX: the original iterated checks[u] (dense index)
            # instead of checks[ou] (original id); that only worked when
            # user ids happened to coincide with their indexes.
            for check in checks[ou]:
                i = self.item_index[check[0]]
                if i not in r[u]:
                    r[u][i] = 0
                if accumulate:
                    r[u][i] += 1
                else:
                    r[u][i] = 1
        return r

    if (os.path.exists(p_name) and os.path.exists(q_name)
            and os.path.exists(user_list_name)
            and os.path.exists(item_list_name)):
        self.p = read_obj(p_name)
        self.q = read_obj(q_name)
        self.users = read_obj(user_list_name)
        self.items = read_obj(item_list_name)
        self.M = len(self.users)
        self.N = len(self.items)
        print(self.M, self.N)
        self.user_index = build_index(self.users)
        self.item_index = build_index(self.items)
        # NOTE(review): this branch stores 0/1 indicators while the
        # training branch stores visit counts — inconsistency preserved
        # from the original; confirm which one downstream code expects.
        self.R = build_r(accumulate=False)
    else:
        print('init user, items index')
        users = set()
        items = set()
        for u in checks.keys():
            users.add(u)
            for check in checks[u]:
                items.add(check[0])
        self.users = list(users)
        self.items = list(items)
        self.M = len(self.users)
        self.N = len(self.items)
        print(self.M, self.N)
        self.user_index = build_index(self.users)
        self.item_index = build_index(self.items)
        self.R = build_r(accumulate=True)
        # Initialise the parameter matrices by gradient ascent and cache
        # everything for the next run.
        self.p, self.q = self.gradAscent()
        write_obj(p_name, self.p)
        write_obj(q_name, self.q)
        write_obj(user_list_name, self.users)
        write_obj(item_list_name, self.items)
# loc_center = read_center('trainRF-SH-FoursquareLocationCenter.csv') # user_center = read_center('trainRF-SH-FoursquareUserCenter.csv') loc_center = read_center('trainRF-NA-Gowalla_LocCenter.txt') user_center = read_center('trainRF-NA-Gowalla_UserCenter.txt') # 这个流行度好像和 距离没法融合 # loc_users = read_location_users(train_file) # maxu = max([len(users) for loc, users in loc_users.items()]) # pop_inf = {loc: len(users)/maxu for loc, users in loc_users.items()} # rec_file = 'mid_data/trainRF-SH-FoursquareCheckins/0.5-0.3-soc0.5-sq_score1d-cosine_1/[0.3, 0.2, 0.5]/ex_rec-5.txt' rec_file = 'mid_data/trainRF-NA-Gowalla_totalCheckins/0.5-0.3-soc0.5-sq_score1d-cosine_1/[0.3, 0.1, 0.6]/ex_rec-5.txt' orec = read_obj(rec_file) geo_inf = GeoInf(a=0.84534522188, b=-1.61667304945, checks=table, loc_center=loc_center, user_center=user_center) # geo_inf = GeoInf(a=0.651, b=-1.628, checks=table, loc_center=loc_center, user_center=user_center) # locs = set(loc_center.keys()) # users = ["11823", "10362", "11588", "16457", "2738", "7380", "1676", "2270", "9429", "10650", "9488", "10320", "2461", "4330", "9565", "8895", "16248", "16201", "16633", "14710", "9632", "4962", "10579", "16057", "7836", "4971", "12417", "6791", "16181", "6533", "322", "132", "11998", "2882", "10184", "15244", "15469", "9210", "15982", "685", "1147", "7313", "6390", "11391", "13552", "4421", "11881", "2953", "10025", "4610", "15455", "7744", "11512", "13107", "11328", "2153", "2150", "13310", "10554", "17003", "4343", "17836", "13097", "3510", "7806", "15655", "70", "15838", "17717", "17390", "4282", "16446", "15078", "6074", "9504", "12785", "740", "8525", "16427", "2188", "11119"] # rate = 0 pre = [] re = []
def cf_main(train_file, test_file, topns=None, topks=None, topic_num=8):
    """Evaluate the plain LDA/PLSA recommender end to end.

    Reads train/test check-in tables, trains (or reloads) a MyLDA
    model, builds a candidate item pool from the 1000 top items of each
    topic, produces (or reloads) recommendations, and reports
    precision/recall at each cutoff in *topks*.

    :param train_file: training check-in file (tab separated)
    :param test_file: test check-in file (tab separated)
    :param topns: neighbourhood sizes to iterate over (default [20])
    :param topks: ranking cutoffs for precision/recall (default [20])
    :param topic_num: number of latent topics for the LDA model
    :returns: (nprs, nres) — per-topn precision / recall rows
    """
    if topks is None:
        topks = [20]
    if topns is None:
        topns = [20]
    nprs = []
    nres = []
    print('read_table')
    table = read_checks_table(train_file, split_sig='\t', uin=0, iin=4,
                              timein=1, scorein=None,
                              time_format='%Y-%m-%dT%H:%M:%SZ')
    test = read_checks_table(test_file, split_sig='\t', uin=0, iin=4,
                             timein=1, scorein=None,
                             time_format='%Y-%m-%dT%H:%M:%SZ')
    # Alternative dataset format kept for reference:
    # table = read_checks_table(train_file, split_sig=',', uin=0, iin=4, timein=3, scorein=None,
    #                           time_format='%Y-%m-%d %H:%M:%S')
    # test = read_checks_table(test_file, split_sig=',', uin=0, iin=4, timein=3, scorein=None,
    #                          time_format='%Y-%m-%d %H:%M:%S')
    # '''
    # friends_dic = read_dic_set('Gowalla_edges.txt')
    if not os.path.exists('mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'):
        os.mkdir('mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/')
    # ========= LDA ================
    lda = MyLDA(train_filename=train_file, topic_num=topic_num,
                split_sig='\t', uin=0, iin=4, timein=1,
                time_format='%Y-%m-%dT%H:%M:%SZ')
    # lda = MyLDA(train_filename=train_file, topic_num=topic_num, split_sig=',', uin=0, iin=4, timein=3, time_format='%Y-%m-%d %H:%M:%S')
    # sim_fun = lambda u1, u2: lda.sim(u1, u2)
    predict_fun = lda.predict
    # '''
    sim_fun_name = 'lda' + str(topic_num) + 't'
    dir_name = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/' + sim_fun_name + '/'
    sim_name = dir_name + 'sim.txt'
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    # Similarity-matrix caching is disabled in this variant:
    # if os.path.exists(sim_name):
    #     print('read sim metrics from file')
    #     sim_metrics = read_obj(sim_name)
    # else:
    #     print('cal_sim_mat')
    #     sim_metrics = cal_sim_mat(table, similar_fun=sim_fun)
    #     write_obj(sim_name, sim_metrics)
    for topn in topns:
        rec_name = dir_name + '-'.join(['rec', sim_fun_name, str(topn)]) + '.txt'
        ex_rec_name = dir_name + '-'.join(['ex_rec', sim_fun_name, str(topn)]) + '.txt'
        if os.path.exists(ex_rec_name):
            print('read recommend result from file')
            rec = read_obj(ex_rec_name)
        else:
            print('recommend')
            users = set(table.keys())
            items = set()
            # Candidate pool: the 1000 highest-probability items of
            # P(i|z) for every topic.
            # NOTE(review): here pr_i_in_z is a dict of dicts (iterated
            # with .items()), unlike the matrix variant elsewhere in
            # this file — confirm which model class is in use.
            for z, zis in lda.pr_i_in_z.items():
                items.update(
                    [e[0] for e in sort_dict(lda.pr_i_in_z[z])[:1000]])
            print(len(items))
            # for item, v in lda.pr_i_in_z[0].items():
            #     items.add(item)
            # Recommend only items each user has not already visited.
            rec = exclude_recommend(table, users, items, predict_fun)
            # write_obj(rec_name, rec)
            # exclude_dup(table, rec)
            write_obj(ex_rec_name, rec)
        prs = []
        res = []
        for topk in topks:
            print('precision')
            pr = precision(rec, test, topk)
            print(pr)
            re = recall(rec, test, topk)
            print('recall')
            prs.append(float('%.4f' % pr))
            res.append(float('%.4f' % re))
        # print('y1=',prs)
        # print('y2=',res)
        nprs.append(prs.copy())
        nres.append(res.copy())
        # Results are rewritten after every topn so partial runs persist.
        out_json_to_file(dir_name + 'nprs.txt', nprs)
        out_json_to_file(dir_name + 'nres.txt', nres)
    return nprs, nres
def __init__(self, train_filename, friends_file, topic_num, split_sig, time_format, uin, iin, timein, read_from_file=False):
    """Friend-aware PLSA: load cached parameters when all five cache
    files exist, otherwise run EM training and persist the learned
    distributions.

    :param train_filename: path of the training check-in file
    :param friends_file: path of the user-friendship edge file
    :param topic_num: number of latent topics
    :param split_sig: field separator of the input files
    :param time_format: strptime format of the timestamp column
    :param uin: column index of the user id
    :param iin: column index of the item id
    :param timein: column index of the timestamp
    :param read_from_file: unused; kept for caller compatibility
    """
    self.train_filename = train_filename
    self.topic_num = topic_num
    self.checks = read_checks_table(train_filename, split_sig=split_sig,
                                    time_format=time_format, uin=uin,
                                    iin=iin, timein=timein)
    self.friends = read_dic_set(friends_file, split_tag=split_sig,
                                oin=0, ain=1)
    self.check_set = read_dic_set(train_filename, split_tag=split_sig,
                                  oin=0, ain=4)

    cache_dir = ('mid_data/' + '-'.join(train_filename.split('.')[:-1])
                 + '/' + self.name + str(topic_num) + 't/')
    f_in_z_file = cache_dir + 'pr_f_in_z.txt'
    i_in_z_file = cache_dir + 'pr_i_in_z.txt'
    u_in_f_file = cache_dir + 'pr_u_in_f.txt'
    z_file = cache_dir + 'pz.txt'
    pr_file = cache_dir + 'pr.txt'

    # Make every user a member of his own friend set (assumes integer
    # user ids 0..max).
    for uid in range(max(self.check_set.keys()) + 1):
        if uid in self.friends:
            self.friends[uid].add(uid)
        else:
            self.friends[uid] = {uid}

    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)

    cache_files = (f_in_z_file, i_in_z_file, z_file, pr_file, u_in_f_file)
    if all(os.path.exists(f) for f in cache_files):
        self.pr_f_in_z = read_obj(f_in_z_file)
        self.pr_i_in_z = read_obj(i_in_z_file)
        self.pr_u_in_f = read_obj(u_in_f_file)
        self.pz = read_obj(z_file)
    else:
        (self.pr_f_in_z, self.pr_i_in_z, self.pr_u_in_f,
         self.pz, self.pr) = self.init_data()
        self.em_loop()
        # P(z|u) aggregation is currently disabled; the attribute is
        # left empty on the training path.
        self.pr_z_in_u = {}
        write_obj(f_in_z_file, self.pr_f_in_z)
        write_obj(i_in_z_file, self.pr_i_in_z)
        write_obj(z_file, self.pz)
        write_obj(pr_file, self.pr)
        write_obj(u_in_f_file, self.pr_u_in_f)
    print(self.pz)
# --- tail of a keys_to_index(keys)-style helper; its `def` line lies
# above this chunk, so only the body is visible here. It maps each key
# to its position in the (materialised) key list.
    keys = list(keys)
    index = {}
    for i in range(len(keys)):
        index[keys[i]] = i
    return index


def dic_to_mat(index, dic):
    # Convert a nested similarity dict {u1: {u2: score}} into a dense
    # square matrix, using *index* to map ids to row/column positions.
    # Missing pairs stay 0.
    rmat = np.zeros(shape=(len(index), len(index)))
    for u1, uss in dic.items():
        for u2, s in uss.items():
            rmat[index[u1], index[u2]] = s
    return rmat


if __name__ == '__main__':
    # Blend two cached social-similarity matrices 50/50 and print the
    # result. The index is built from the first map's keys — assumes
    # both maps cover the same user set; TODO confirm.
    sim_map1 = read_obj(
        '../mid_data/trainRF-SH-FoursquareCheckins/1-0.5-0.3-soc-group0-soc-group1-soc-group2/soc-group0'
    )
    sim_map2 = read_obj(
        '../mid_data/trainRF-SH-FoursquareCheckins/1-0.5-0.3-soc-group0-soc-group1-soc-group2/soc-group1'
    )
    # sim_map2 = {u: {f[0]: f[1] for f in fs} for u, fs in sim_map2.items()}
    index = keys_to_index(sim_map1.keys())
    m1 = dic_to_mat(index, sim_map1)
    m2 = dic_to_mat(index, sim_map2)
    m = m1 * 0.5 + m2 * 0.5
    print(m)