def cal_comm_mat_sm(path_str): ''' calculate commuting matrix for U-*-U-pos-B style in merge way with 7 simple motifs (sm) ''' uid_filename = dir_ + 'uids.txt' bid_filename = dir_ + 'bids.txt' ub_filename = dir_ + 'uid_bid.txt' print 'cal commut mat with motif for %s, filenames: %s, %s, %s' % ( path_str, uid_filename, bid_filename, ub_filename) uids, uid2ind, ind2uid = load_eids(uid_filename, 'user') bids, bid2ind, ind2bid = load_eids(bid_filename, 'biz') # upb = np.loadtxt(upb_filename, dtype=np.int64) ub = np.loadtxt(ub_filename, dtype=np.int64) # adj_upb, adj_upb_t = generate_adj_mat(upb, uid2ind, bid2ind) adj_ub, adj_ub_t = generate_adj_mat(ub, uid2ind, bid2ind) social_filename = dir_ + 'user_social.txt' uu = np.loadtxt(social_filename, dtype=np.int64) adj_uu, adj_uu_t = generate_adj_mat(uu, uid2ind, uid2ind) motif_matrix = compute_motif_matrix(adj_uu, adj_uu_t, path_str) if path_str[:3] == 'UUB': base_matrix = adj_uu if path_str[:4] == 'UBUB': base_matrix = adj_ub.dot(adj_ub_t) #for n in range(1, 10): for n in range(11): alpha = n * 0.1 UBU_merge = (1 - alpha) * base_matrix + alpha * motif_matrix start = time.time() UBUB = UBU_merge.dot(adj_ub) print 'UBUB(%s), density=%.5f cost %.2f seconds' % ( UBUB.shape, UBUB.nnz * 1.0 / UBUB.shape[0] / UBUB.shape[1], time.time() - start) start = time.time() K = 500 #normal way triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K) wfilename = dir_ + 'sim_res/path_count/%s_%s_top%s.res' % (path_str, alpha, K) save_triplets(wfilename, triplets) print 'finish saving %s %s entries in %s, cost %.2f seconds' % ( len(triplets), path_str, wfilename, time.time() - start)
def cal_comm_mat_UBB(path_str): ''' 200k ratings calculate the commuting matrix in U-B-*-B style in fact, only need to calculate BB ''' print "path str:", path_str uid_filename = dir_ + 'uids.txt'#users print 'run cal_comm_mat_samples for 10k users in ', uid_filename lines = open(uid_filename, 'r').readlines() uids = [int(l.strip()) for l in lines] uid2ind = {v:k for k,v in enumerate(uids)} ind2uid = reverse_map(uid2ind) bid_filename = dir_ + 'bids.txt'#items lines = open(bid_filename, 'r').readlines() bids = [int(l.strip()) for l in lines] bid2ind = {v:k for k,v in enumerate(bids)} ind2bid = reverse_map(bid2ind) upb_filename = dir_ + 'uid_pos_bid.txt'# positive rating upb = np.loadtxt(upb_filename, dtype=int) # generate users items adjacency matrix adj_ub, adj_ub_t = generate_adj_mat(upb, uid2ind, bid2ind) # print uid2ind[640698], bid2ind[51874] # print type(adj_ub), adj_ub.toarray()[uid2ind[640698]][bid2ind[51874]], adj_ub.toarray().shape # print type(adj_ub_t), adj_ub_t.toarray()[bid2ind[51874]][uid2ind[640698]], adj_ub_t.toarray().shape # generate items object adjacency matrix (cat, state, city, star) adj_bo, adj_bo_t = get_bo(path_str, bid2ind) t1 = time.time() # compute u-> b -> o(cat,city) <- b comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t) t2 = time.time() print 'cal res of %s cost %2.f seconds' % (path_str, t2 - t1) print 'comm_res shape=%s,densit=%s' % (comm_res.shape, comm_res.nnz * 1.0/comm_res.shape[0]/comm_res.shape[1]) K = 500 wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K) triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K) save_triplets(wfilename, triplets) #batch_save_comm_res(path_str, wfilename, comm_res, ind2uid, ind2bid) t3 = time.time() print 'save res of %s cost %2.f seconds' % (path_str, t3 - t2)
def get_bo(path_str, bid2ind):
    '''
    Build the business-object adjacency matrices for a meta-path such as
    U-pos-B-Cat-B.

    The attribute file is chosen by the first keyword found in path_str,
    checked in this order: 'State', 'Cat', 'City', 'Star'. Each line of the
    file is expected to hold two whitespace-separated integers:
    ``<business id> <object id>``.

    path_str: meta-path name containing one of the keywords above.
    bid2ind: mapping from business id to row index.

    Returns the ``(adj_bo, adj_bo_t)`` pair produced by ``generate_adj_mat``.

    Raises ValueError if path_str contains none of the keywords (previously
    an UnboundLocalError on ``sfilename``).
    '''
    # Order matters: the first matching keyword wins, as in the original
    # if/elif chain.
    keyword_files = (
        ('State', 'bid_state.txt'),
        ('Cat', 'bid_cat.txt'),
        ('City', 'bid_city.txt'),
        ('Star', 'bid_stars.txt'),
    )
    for keyword, basename in keyword_files:
        if keyword in path_str:
            sfilename = dir_ + basename
            break
    else:
        raise ValueError(
            "unsupported path_str %r: expected one of 'State', 'Cat', "
            "'City', 'Star' in it" % (path_str,))

    with open(sfilename, 'r') as fin:
        parts = [line.strip().split() for line in fin]
    bos = [(int(b), int(o)) for b, o in parts]

    # Index the distinct object ids; the index order follows set iteration.
    ond2ind = {o: ind for ind, o in enumerate({o for _, o in bos})}
    adj_bo, adj_bo_t = generate_adj_mat(bos, bid2ind, ond2ind)
    return adj_bo, adj_bo_t
def cal_rar(path_str): aid_filename = dir_ + 'aids.txt' rid_filename = dir_ + 'rids.txt' aids = open(aid_filename, 'r').readlines() aids = [int(r.strip()) for r in aids] aid2ind = {a: ind for ind, a in enumerate(aids)} #global ind ind2aid = reverse_map(aid2ind) rids = open(rid_filename, 'r').readlines() rids = [int(r.strip()) for r in rids] rid2ind = {r: ind for ind, r in enumerate(rids)} #global ind ind2rid = reverse_map(rid2ind) if 'P' in path_str: ura_filename = dir_ + 'uid_rid_pos_aid_weight.txt' elif 'N' in path_str: ura_filename = dir_ + 'uid_rid_neg_aid_weight.txt' ura = np.loadtxt(ura_filename, dtype=np.float64) ra = ura[:, (1, 2, 3)] ra = [(int(r), int(a), w) for r, a, w in ra] adj_ra, adj_ra_t = generate_adj_mat(ra, rid2ind, aid2ind, is_weight=True) t1 = time.time() RA = adj_ra.toarray() t2 = time.time() print 'to dense RA%s cost %.2f seconds' % (RA.shape, t2 - t1) RAR_csr = cal_rar_block(RA, len(rid2ind), ind2rid, step=20000) print 'finish cal rar by blocks, cost %.2f minutes' % ( (time.time() - t2) / 60.0) try: wfilename = dir_ + 'sim_res/path_count/%s_spa_mat.pickle' % path_str fw = open(wfilename, 'w+') pickle.dump(RAR_csr, fw, pickle.HIGHEST_PROTOCOL) map_filename = dir_ + 'sim_res/path_count/%s_spa_mat_id_map.pickle' % path_str fw = open(map_filename, 'w+') pickle.dump(ind2rid, fw, pickle.HIGHEST_PROTOCOL) print 'finish saving sparse mat in ', wfilename except Exception as e: print e
def cal_comm_mat_UBB(path_str): ''' 200k ratings calculate the commuting matrix in U-B-*-B style in fact, only need to calculate BB ''' uid_filename = dir_ + 'uids.txt' print 'run cal_comm_mat_samples for 10k users in ', uid_filename lines = open(uid_filename, 'r').readlines() uids = [int(l.strip()) for l in lines] uid2ind = {v: k for k, v in enumerate(uids)} ind2uid = reverse_map(uid2ind) bid_filename = dir_ + 'bids.txt' lines = open(bid_filename, 'r').readlines() bids = [int(l.strip()) for l in lines] bid2ind = {v: k for k, v in enumerate(bids)} ind2bid = reverse_map(bid2ind) upb_filename = dir_ + 'uid_pos_bid.txt' upb = np.loadtxt(upb_filename, dtype=int) adj_ub, adj_ub_t = generate_adj_mat(upb, uid2ind, bid2ind) adj_bo, adj_bo_t = get_bo(path_str, bid2ind) t1 = time.time() comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t) t2 = time.time() print 'cal res of %s cost %2.f seconds' % (path_str, t2 - t1) print 'comm_res shape=%s,densit=%s' % (comm_res.shape, comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]) K = 500 wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K) triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K) save_triplets(wfilename, triplets) #batch_save_comm_res(path_str, wfilename, comm_res, ind2uid, ind2bid) t3 = time.time() print 'save res of %s cost %2.f seconds' % (path_str, t3 - t2)
def cal_comm_mat_USUB(path_str): ''' Given meta_structure_str, generate the commuting matrix e.g. 'user-review-business,t10_aspect-review-user' ''' uid_filename = dir_ + 'uids.txt' bid_filename = dir_ + 'bids.txt' aid_filename = dir_ + 'aids.txt' rid_filename = dir_ + 'rids.txt' upb_filename = dir_ + 'uid_pos_bid.txt' print 'cal commut mat for %s, filenames: %s, %s, %s' % (path_str, uid_filename, bid_filename, upb_filename) uids, uid2ind, ind2uid = load_eids(uid_filename, 'user') bids, bid2ind, ind2bid = load_eids(bid_filename, 'biz') aids, aid2ind, ind2aid = load_eids(aid_filename, 'aspect') upb = np.loadtxt(upb_filename, dtype=np.int64) adj_upb, adj_upb_t = generate_adj_mat(upb, uid2ind, bid2ind) if 'P' in path_str: urb_filename = dir_ + 'uid_rid_pos_bid.txt' ura_filename = dir_ + 'uid_rid_pos_aid.txt' ind2rid_filename = dir_ + 'sim_res/path_count/%s_spa_mat_id_map.pickle' % path_str rar_mat_filename = dir_ + 'sim_res/path_count/%s_spa_mat.pickle' % path_str elif 'N' in path_str: urb_filename = dir_ + 'uid_rid_neg_bid.txt' ura_filename = dir_ + 'uid_rid_neg_aid.txt' ind2rid_filename = dir_ + 'sim_res/path_count/%s_spa_mat_id_map.pickle' % path_str rar_mat_filename = dir_ + 'sim_res/path_count/%s_spa_mat.pickle' % path_str f = open(ind2rid_filename, 'r') ind2rid = pickle.load(f) rid2ind = reverse_map(ind2rid) urb = np.loadtxt(urb_filename, dtype=np.int64) ura = np.loadtxt(ura_filename, dtype=np.int64) ur = urb[:,(0,1)] adj_ur, adj_ur_t = generate_adj_mat(ur, uid2ind, rid2ind) rb = urb[:,(1,2)] adj_rb, adj_rb_t = generate_adj_mat(rb, rid2ind, bid2ind) ra = ura[:,(1,2)] adj_ra, adj_ra_t = generate_adj_mat(ra, rid2ind, aid2ind) start = time.time() RBR = adj_rb.dot(adj_rb_t) print 'RBR(%s), density=%.5f cost %.2f seconds' % (RBR.shape, RBR.nnz * 1.0/RBR.shape[0]/RBR.shape[1], time.time() - start) start = time.time() #RAR = adj_ra.dot(adj_ra_t) f = open(rar_mat_filename, 'r') RAR = pickle.load(f) print 'load RAR(%s), density=%.5f cost %.2f seconds' % (RAR.shape, 
RAR.nnz * 1.0/RAR.shape[0]/RAR.shape[1], time.time() - start) start = time.time() RSR = RBR.multiply(RAR) print 'RSR(%s), density=%.5f cost %.2f seconds' % (RSR.shape, RSR.nnz * 1.0/RSR.shape[0]/RSR.shape[1], time.time() - start) start = time.time() URSR = adj_ur.dot(RSR) print 'URSR(%s), density=%.5f cost %.2f seconds' % (URSR.shape, URSR.nnz * 1.0/URSR.shape[0]/URSR.shape[1], time.time() - start) start = time.time() URSRU = URSR.dot(adj_ur_t) print 'URSRU(%s), density=%.5f cost %.2f seconds' % (URSRU.shape, URSRU.nnz * 1.0/URSRU.shape[0]/URSRU.shape[1], time.time() - start) start = time.time() URSRUB = URSRU.dot(adj_upb) print 'URSRUB(%s), density=%.5f cost %.2f seconds' % (URSRUB.shape, URSRUB.nnz * 1.0/URSRUB.shape[0]/URSRUB.shape[1], time.time() - start) start = time.time() K = 500 wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K) #wfilename = dir_ + 'sim_res/path_count/%s.res' % path_str batch_save_comm_res(path_str, wfilename, URSRUB, ind2uid, ind2bid) print 'finish saving %s %s entries in %s, cost %.2f seconds' % (URSRUB.nnz, path_str, wfilename, time.time() - start)
def cal_comm_mat_UUB(path_str, cikm=False): ''' calculate commuting matrix for U-*-U-pos-B style ''' print "path str:", path_str uid_filename = dir_ + 'uids.txt' bid_filename = dir_ + 'bids.txt' upb_filename = dir_ + 'uid_pos_bid.txt' if not cikm: rid_filename = dir_ + 'rids.txt' aid_filename = dir_ + 'aids.txt' print 'cal commut mat for %s, filenames: %s, %s, %s' % (path_str, uid_filename, bid_filename, upb_filename) uids, uid2ind, ind2uid = load_eids(uid_filename, 'user') bids, bid2ind, ind2bid = load_eids(bid_filename, 'biz') if not cikm: rids, rid2ind, ind2rid = load_eids(rid_filename, 'review') aids, aid2ind, ind2aid = load_eids(aid_filename, 'aspect') upb = np.loadtxt(upb_filename, dtype=np.int64) adj_upb, adj_upb_t = generate_adj_mat(upb, uid2ind, bid2ind) if path_str == 'UPBUB': start = time.time() UBU = adj_upb.dot(adj_upb_t) print 'UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 1.0/UBU.shape[0]/UBU.shape[1], time.time() - start) elif path_str in ['UPBCatBUB', 'UPBCityBUB']: start = time.time() adj_bo, adj_bo_t = get_bo(path_str, bid2ind) UBO = adj_upb.dot(adj_bo) UBU = UBO.dot(UBO.transpose()) print 'UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 1.0/UBU.shape[0]/UBU.shape[1], time.time() - start) elif path_str in ['UNBCatBUB', 'UNBCityBUB']: unb_filename = dir_ + 'uid_neg_bid.txt' unb = np.loadtxt(unb_filename, dtype=np.int64) adj_unb, adj_unb_t = generate_adj_mat(unb, uid2ind, bid2ind) start = time.time() adj_bo, adj_bo_t = get_bo(path_str, bid2ind) UBO = adj_unb.dot(adj_bo) UBU = UBO.dot(UBO.transpose()) print 'UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 1.0/UBU.shape[0]/UBU.shape[1], time.time() - start) elif path_str == 'UNBUB': unb_filename = dir_ + 'uid_neg_bid.txt' unb = np.loadtxt(unb_filename, dtype=np.int64) adj_unb, adj_unb_t = generate_adj_mat(unb, uid2ind, bid2ind) start = time.time() UBU = adj_unb.dot(adj_unb_t) print 'UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 
1.0/UBU.shape[0]/UBU.shape[1], time.time() - start) elif path_str == 'UUB': social_filename = dir_ + 'user_social.txt' uu = np.loadtxt(social_filename, dtype=np.int64) adj_uu, adj_uu_t = generate_adj_mat(uu, uid2ind, uid2ind) start = time.time() UBU = adj_uu.copy() print 'UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 1.0/UBU.shape[0]/UBU.shape[1], time.time() - start) elif path_str == 'UCompUB': uid_comp_filename = dir_ + 'uid_comp.txt' uc = np.loadtxt(uid_comp_filename, dtype=np.int64) cids = set(uc[:,1]) cid2ind = {v:k for k,v in enumerate(cids)} ind2cnd = reverse_map(cid2ind) adj_uc, adj_uc_t = generate_adj_mat(uc, uid2ind, cid2ind) start = time.time() UBU = adj_uc.dot(adj_uc_t) print 'UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 1.0/UBU.shape[0]/UBU.shape[1], time.time() - start) elif path_str == 'URPARUB': urpa_filename = dir_ + 'uid_rid_pos_aid.txt' urpa = np.loadtxt(urpa_filename) ur = list(set([(u,r) for u, r in urpa[:,(0,1)]]))# u, r multiple aspects, thus u-r can be duplicate adj_ur, adj_ur_t = generate_adj_mat(ur, uid2ind, rid2ind) ra = urpa[:,(1,2)] adj_ra, adj_ua_t = generate_adj_mat(ra, rid2ind, aid2ind) start = time.time() URA = adj_ur.dot(adj_ra) UBU = URA.dot(URA.transpose())#it should be URARU, here we use UBU for convenience print 'UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 1.0/UBU.shape[0]/UBU.shape[1], time.time() - start) elif path_str == 'URNARUB': urpa_filename = dir_ + 'uid_rid_neg_aid.txt' urpa = np.loadtxt(urpa_filename) ur = list(set([(u,r) for u, r in urpa[:,(0,1)]]))# u, r multiple aspects, thus u-r can be duplicate adj_ur, adj_ur_t = generate_adj_mat(ur, uid2ind, rid2ind) ra = urpa[:,(1,2)] adj_ra, adj_ua_t = generate_adj_mat(ra, rid2ind, aid2ind) start = time.time() URA = adj_ur.dot(adj_ra) UBU = URA.dot(URA.transpose())#it should be URARU, here we use UBU for convenience print 'UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 1.0/UBU.shape[0]/UBU.shape[1], 
time.time() - start) start = time.time() UBUB = UBU.dot(adj_upb) print 'UBUB(%s), density=%.5f cost %.2f seconds' % (UBUB.shape, UBUB.nnz * 1.0/UBUB.shape[0]/UBUB.shape[1], time.time() - start) start = time.time() K = 500 triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K) wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K) save_triplets(wfilename, triplets) #save_comm_res(path_str, wfilename, UBUB, ind2uid, ind2bid) print 'finish saving %s %s entries in %s, cost %.2f seconds' % (len(triplets), path_str, wfilename, time.time() - start)