Esempio n. 1
0
def cal_comm_mat_UBB(path_str):
    '''
        Compute the commuting matrix for a U-B-*-B style meta-path and save
        its top-K entries.

        Reads the user ids (uids.txt), item ids (bids.txt) and positive
        ratings (uid_pos_bid.txt) from dir_, builds the user-item adjacency
        matrix and the item-object adjacency matrix selected by path_str,
        lets cal_mat_ubb combine them, and writes the triplets returned by
        get_topK_items (K = 500) to
        dir_ + 'sim_res/path_count/<path_str>_top500.res'.

        Original author's note: in fact, only BB needs to be calculated.

        path_str: meta-path name; forwarded to get_bo and cal_mat_ubb and
                  used in the output filename.
        Returns None; progress and timings are printed.
    '''
    print('path str: %s' % path_str)

    uid_filename = dir_ + 'uids.txt'  # users
    # The two spaces before the filename reproduce the output of the former
    # two-argument print statement.
    print('run cal_comm_mat_samples for 10k users in  %s' % uid_filename)
    # `with` closes the handle as soon as the ids are parsed; blank lines
    # (e.g. a trailing newline at the end of the file) are skipped instead
    # of crashing int().
    with open(uid_filename, 'r') as uid_file:
        uids = [int(line.strip()) for line in uid_file if line.strip()]
    uid2ind = {uid: ind for ind, uid in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)

    bid_filename = dir_ + 'bids.txt'  # items
    with open(bid_filename, 'r') as bid_file:
        bids = [int(line.strip()) for line in bid_file if line.strip()]
    bid2ind = {bid: ind for ind, bid in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)

    upb_filename = dir_ + 'uid_pos_bid.txt'  # positive ratings
    upb = np.loadtxt(upb_filename, dtype=int)

    # users-items adjacency matrix (the transpose is not needed here)
    adj_ub, _ = generate_adj_mat(upb, uid2ind, bid2ind)

    # items-object adjacency matrix (cat, state, city, star)
    adj_bo, adj_bo_t = get_bo(path_str, bid2ind)

    t1 = time.time()
    # compute u -> b -> o(cat, city) <- b
    comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t)

    t2 = time.time()
    # '%.2f' (two decimals) matches the other timing messages in this file;
    # the former '%2.f' rounded the duration to a whole number.
    print('cal res of %s cost %.2f seconds' % (path_str, t2 - t1))
    density = comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]
    print('comm_res shape=%s,densit=%s' % (comm_res.shape, density))

    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K)

    # NOTE: batch_save_comm_res(path_str, wfilename, comm_res, ind2uid,
    # ind2bid) is the alternative that dumps every entry, not just the top K.
    save_triplets(wfilename, triplets)
    t3 = time.time()
    print('save res of %s cost %.2f seconds' % (path_str, t3 - t2))
Esempio n. 2
0
def cal_comm_mat_sm(path_str):
    '''
        Calculate commuting matrices for U-*-U-pos-B style paths in a merged
        way with 7 simple motifs (sm).

        A base user-user matrix is chosen from the prefix of path_str:
            'UUB...'  -> the social adjacency matrix (user_social.txt)
            'UBUB...' -> adj_ub . adj_ub^T built from uid_bid.txt
        For alpha = 0.0, 0.1, ..., 1.0 the base matrix is blended with the
        matrix returned by compute_motif_matrix as
            (1 - alpha) * base + alpha * motif
        multiplied by the user-item adjacency matrix, and the triplets
        returned by get_topK_items (K = 500) are written to
        dir_ + 'sim_res/path_count/<path_str>_<alpha>_top500.res'.

        path_str: meta-path name; must start with 'UUB' or 'UBUB'.
        Raises ValueError for any other prefix (previously this surfaced as
        a NameError only after all inputs had been loaded).
        Returns None; progress and timings are printed.
    '''
    # Fail fast: without one of these prefixes there is no base matrix.
    if not path_str.startswith(('UUB', 'UBUB')):
        raise ValueError(
            "unsupported path_str %r: expected a 'UUB' or 'UBUB' prefix"
            % (path_str,))

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    ub_filename = dir_ + 'uid_bid.txt'

    print('cal commut mat with motif for %s, filenames: %s, %s, %s' % (
        path_str, uid_filename, bid_filename, ub_filename))
    # Only the id <-> index maps are needed; the raw id lists are discarded.
    _, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    _, bid2ind, ind2bid = load_eids(bid_filename, 'biz')

    ub = np.loadtxt(ub_filename, dtype=np.int64)
    adj_ub, adj_ub_t = generate_adj_mat(ub, uid2ind, bid2ind)

    social_filename = dir_ + 'user_social.txt'
    uu = np.loadtxt(social_filename, dtype=np.int64)
    adj_uu, adj_uu_t = generate_adj_mat(uu, uid2ind, uid2ind)

    motif_matrix = compute_motif_matrix(adj_uu, adj_uu_t, path_str)

    if path_str.startswith('UUB'):
        base_matrix = adj_uu
    else:  # 'UBUB' prefix, guaranteed by the check at the top
        base_matrix = adj_ub.dot(adj_ub_t)

    K = 500
    for n in range(11):
        # n / 10.0 is the float closest to each decimal step, so the '%s'
        # in the filename renders as e.g. '0.3' on every Python version
        # (n * 0.1 yields 0.30000000000000004 for n = 3).
        alpha = n / 10.0
        UBU_merge = (1 - alpha) * base_matrix + alpha * motif_matrix
        start = time.time()
        UBUB = UBU_merge.dot(adj_ub)
        print('UBUB(%s), density=%.5f cost %.2f seconds' % (
            UBUB.shape, UBUB.nnz * 1.0 / UBUB.shape[0] / UBUB.shape[1],
            time.time() - start))

        start = time.time()
        triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K)
        wfilename = dir_ + 'sim_res/path_count/%s_%s_top%s.res' % (
            path_str, alpha, K)
        save_triplets(wfilename, triplets)
        print('finish saving %s %s entries in %s, cost %.2f seconds' % (
            len(triplets), path_str, wfilename, time.time() - start))
Esempio n. 3
0
def batch_save_comm_res(path_str, wfilename, comm_res, ind2row, ind2col):
    '''
        Save every stored entry of a sparse matrix as (row id, col id, value)
        triplets, in batches of 10 million entries.

        path_str:  label used only in the progress message.
        wfilename: output file; each batch is handed to save_triplets with
                   is_append=True.
                   NOTE(review): presumably an already existing file is
                   extended rather than truncated - verify against
                   save_triplets.
        comm_res:  sparse matrix providing tocoo().
        ind2row:   maps a row index to the id written out.
        ind2col:   maps a column index to the id written out.
        Returns None; progress and timings are printed.
    '''
    coo = comm_res.tocoo(copy=False)
    step = 10000000
    total = len(coo.data)
    # Ceiling division with '//' stays an int under true division as well.
    # At least one batch is kept so that an empty matrix still results in
    # one save_triplets call, as before; an exact multiple of `step` no
    # longer produces a trailing empty batch.
    n_batches = max(1, (total + step - 1) // step)
    for i in range(n_batches):
        start_time = time.time()
        start = i * step
        end = start + step
        triplets = [
            (ind2row[r], ind2col[c], v)
            for r, c, v in zip(coo.row[start:end],
                               coo.col[start:end],
                               coo.data[start:end])
        ]
        save_triplets(wfilename, triplets, is_append=True)
        # Clamp the progress counter so the last batch does not report more
        # entries than exist.
        print('finish saving 10M %s triplets in %s, progress: %s/%s, cost %.2f seconds' % (
            path_str, wfilename, min(end, total), total,
            time.time() - start_time))
Esempio n. 4
0
def cal_comm_mat_UBB(path_str):
    '''
        Compute the commuting matrix for a U-B-*-B style meta-path and save
        its top-K entries.

        Reads the user ids (uids.txt), item ids (bids.txt) and positive
        ratings (uid_pos_bid.txt) from dir_, builds the user-item adjacency
        matrix and the item-object adjacency matrix selected by path_str,
        lets cal_mat_ubb combine them, and writes the triplets returned by
        get_topK_items (K = 500) to
        dir_ + 'sim_res/path_count/<path_str>_top500.res'.

        Original author's note: in fact, only BB needs to be calculated.

        path_str: meta-path name; forwarded to get_bo and cal_mat_ubb and
                  used in the output filename.
        Returns None; progress and timings are printed.
    '''
    uid_filename = dir_ + 'uids.txt'
    # The two spaces before the filename reproduce the output of the former
    # two-argument print statement.
    print('run cal_comm_mat_samples for 10k users in  %s' % uid_filename)
    # `with` closes the handle as soon as the ids are parsed; blank lines
    # (e.g. a trailing newline at the end of the file) are skipped instead
    # of crashing int().
    with open(uid_filename, 'r') as uid_file:
        uids = [int(line.strip()) for line in uid_file if line.strip()]
    uid2ind = {uid: ind for ind, uid in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)

    bid_filename = dir_ + 'bids.txt'
    with open(bid_filename, 'r') as bid_file:
        bids = [int(line.strip()) for line in bid_file if line.strip()]
    bid2ind = {bid: ind for ind, bid in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)

    upb_filename = dir_ + 'uid_pos_bid.txt'
    upb = np.loadtxt(upb_filename, dtype=int)
    # Only the user-item matrix is used; its transpose is discarded.
    adj_ub, _ = generate_adj_mat(upb, uid2ind, bid2ind)

    adj_bo, adj_bo_t = get_bo(path_str, bid2ind)

    t1 = time.time()
    comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t)

    t2 = time.time()
    # '%.2f' (two decimals) matches the other timing messages in this file;
    # the former '%2.f' rounded the duration to a whole number.
    print('cal res of %s cost %.2f seconds' % (path_str, t2 - t1))
    density = comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]
    print('comm_res shape=%s,densit=%s' % (comm_res.shape, density))

    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K)
    # NOTE: batch_save_comm_res(path_str, wfilename, comm_res, ind2uid,
    # ind2bid) is the alternative that dumps every entry, not just the top K.
    save_triplets(wfilename, triplets)
    t3 = time.time()
    print('save res of %s cost %.2f seconds' % (path_str, t3 - t2))
Esempio n. 5
0
def save_comm_res(path_str, filename, comm_res, ind2row, ind2col):
    '''
        Write every stored entry of comm_res to filename as
        (row id, col id, value) triplets via save_triplets.

        path_str: accepted for signature compatibility; not used here.
        filename: output file handed to save_triplets.
        comm_res: sparse matrix providing tocoo().
        ind2row:  maps a row index to the id written out.
        ind2col:  maps a column index to the id written out.
    '''
    entries = comm_res.tocoo()
    # Translate matrix indices back to the original ids, entry by entry.
    records = [
        (ind2row[row_ind], ind2col[col_ind], value)
        for row_ind, col_ind, value in zip(entries.row, entries.col,
                                           entries.data)
    ]
    save_triplets(filename, records)
Esempio n. 6
0
def cal_comm_mat_UUB(path_str, cikm=False):
    '''
        Calculate the commuting matrix for a U-*-U-pos-B style meta-path and
        save its top-K entries.

        A user-user matrix (called UBU for every path, for convenience) is
        built according to path_str and multiplied by the positive user-item
        adjacency matrix from uid_pos_bid.txt. The triplets returned by
        get_topK_items (K = 500) are written to
        dir_ + 'sim_res/path_count/<path_str>_top500.res'.

        Supported path_str values and the inputs they read from dir_:
            'UPBUB'                    positive ratings only
            'UPBCatBUB', 'UPBCityBUB'  positive ratings + get_bo(path_str)
            'UNBCatBUB', 'UNBCityBUB'  uid_neg_bid.txt + get_bo(path_str)
            'UNBUB'                    uid_neg_bid.txt
            'UUB'                      user_social.txt
            'UCompUB'                  uid_comp.txt
            'URPARUB'                  uid_rid_pos_aid.txt
            'URNARUB'                  uid_rid_neg_aid.txt

        path_str: one of the names above.
        cikm:     when True, rids.txt and aids.txt are not loaded, so the
                  two review paths ('URPARUB', 'URNARUB') are unavailable.
        Raises ValueError for an unknown path_str, or for a review path
        combined with cikm=True (both previously surfaced as a NameError
        after the inputs had been loaded).
        Returns None; progress and timings are printed.
    '''
    print('path str: %s' % path_str)

    review_paths = ('URPARUB', 'URNARUB')
    supported_paths = (
        'UPBUB', 'UPBCatBUB', 'UPBCityBUB', 'UNBCatBUB', 'UNBCityBUB',
        'UNBUB', 'UUB', 'UCompUB',
    ) + review_paths
    # Fail fast, before any file is read.
    if path_str not in supported_paths:
        raise ValueError('unsupported path_str %r; expected one of %s'
                         % (path_str, ', '.join(supported_paths)))
    if cikm and path_str in review_paths:
        raise ValueError('%s needs the review/aspect ids, which are not '
                         'loaded when cikm=True' % path_str)

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    upb_filename = dir_ + 'uid_pos_bid.txt'

    print('cal commut mat for %s, filenames: %s, %s, %s' % (path_str, uid_filename, bid_filename, upb_filename))
    # Only the id <-> index maps are used below; the raw id lists are not.
    _, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    _, bid2ind, ind2bid = load_eids(bid_filename, 'biz')
    if not cikm:
        _, rid2ind, _ = load_eids(dir_ + 'rids.txt', 'review')
        _, aid2ind, _ = load_eids(dir_ + 'aids.txt', 'aspect')

    upb = np.loadtxt(upb_filename, dtype=np.int64)
    adj_upb, adj_upb_t = generate_adj_mat(upb, uid2ind, bid2ind)

    # Every branch sets `start` (where its timing begins) and `UBU`; the
    # shared density/timing report follows the chain.
    if path_str == 'UPBUB':
        start = time.time()
        UBU = adj_upb.dot(adj_upb_t)

    elif path_str in ('UPBCatBUB', 'UPBCityBUB'):
        start = time.time()
        adj_bo, _ = get_bo(path_str, bid2ind)
        UBO = adj_upb.dot(adj_bo)
        UBU = UBO.dot(UBO.transpose())

    elif path_str in ('UNBCatBUB', 'UNBCityBUB'):
        unb = np.loadtxt(dir_ + 'uid_neg_bid.txt', dtype=np.int64)
        adj_unb, _ = generate_adj_mat(unb, uid2ind, bid2ind)

        start = time.time()
        adj_bo, _ = get_bo(path_str, bid2ind)
        UBO = adj_unb.dot(adj_bo)
        UBU = UBO.dot(UBO.transpose())

    elif path_str == 'UNBUB':
        unb = np.loadtxt(dir_ + 'uid_neg_bid.txt', dtype=np.int64)
        adj_unb, adj_unb_t = generate_adj_mat(unb, uid2ind, bid2ind)

        start = time.time()
        UBU = adj_unb.dot(adj_unb_t)

    elif path_str == 'UUB':
        uu = np.loadtxt(dir_ + 'user_social.txt', dtype=np.int64)
        adj_uu, _ = generate_adj_mat(uu, uid2ind, uid2ind)

        start = time.time()
        UBU = adj_uu.copy()

    elif path_str == 'UCompUB':
        uc = np.loadtxt(dir_ + 'uid_comp.txt', dtype=np.int64)
        # The second column values are indexed in set-iteration order; the
        # indices are only used inside this branch.
        cid2ind = {cid: ind for ind, cid in enumerate(set(uc[:, 1]))}
        adj_uc, adj_uc_t = generate_adj_mat(uc, uid2ind, cid2ind)

        start = time.time()
        UBU = adj_uc.dot(adj_uc_t)

    else:  # 'URPARUB' / 'URNARUB': identical apart from the input file
        if path_str == 'URPARUB':
            urpa_filename = dir_ + 'uid_rid_pos_aid.txt'
        else:
            urpa_filename = dir_ + 'uid_rid_neg_aid.txt'
        # NOTE(review): loaded without dtype, so the ids are floats here,
        # unlike the int64 loads above - confirm this is intended.
        urpa = np.loadtxt(urpa_filename)
        # a (u, r) pair occurs once per aspect, so de-duplicate the pairs
        ur = list(set([(u, r) for u, r in urpa[:, (0, 1)]]))
        adj_ur, _ = generate_adj_mat(ur, uid2ind, rid2ind)
        ra = urpa[:, (1, 2)]
        adj_ra, _ = generate_adj_mat(ra, rid2ind, aid2ind)

        start = time.time()
        URA = adj_ur.dot(adj_ra)
        # it should be URARU, here we use UBU for convenience
        UBU = URA.dot(URA.transpose())

    print('UBU(%s), density=%.5f cost %.2f seconds' % (UBU.shape, UBU.nnz * 1.0/UBU.shape[0]/UBU.shape[1], time.time() - start))

    start = time.time()
    UBUB = UBU.dot(adj_upb)
    print('UBUB(%s), density=%.5f cost %.2f seconds' % (UBUB.shape, UBUB.nnz * 1.0/UBUB.shape[0]/UBUB.shape[1], time.time() - start))

    start = time.time()
    K = 500
    triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K)
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    # NOTE: save_comm_res(path_str, wfilename, UBUB, ind2uid, ind2bid) is the
    # alternative that dumps every entry, not just the top K.
    save_triplets(wfilename, triplets)
    print('finish saving %s %s entries in %s, cost %.2f seconds' % (len(triplets), path_str, wfilename, time.time() - start))