コード例 #1
0
def cal_comm_mat_UBB(path_str):
    '''
        200k ratings
        calculate the commuting matrix in U-B-*-B style
        in fact, only need to calculate BB

        Reads the user ids, item ids and positive (user, item) pairs stored
        under ``dir_``, builds the user-item and item-object adjacency
        matrices, delegates the product to ``cal_mat_ubb`` and saves the
        triplets returned by ``get_topK_items`` (K = 500) to
        ``sim_res/path_count/<path_str>_top500.res``.

        path_str: meta-path name; forwarded to ``get_bo`` / ``cal_mat_ubb``
                  and embedded in the output filename.
    '''
    # Single-argument print(...) emits the same bytes on Python 2 and 3.
    print('path str: %s' % (path_str,))

    uid_filename = dir_ + 'uids.txt'  # users
    print('run cal_comm_mat_samples for 10k users in  %s' % (uid_filename,))
    # Context managers close the id files even when a line fails to parse.
    with open(uid_filename, 'r') as f:
        uids = [int(line.strip()) for line in f]
    uid2ind = {uid: ind for ind, uid in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)

    bid_filename = dir_ + 'bids.txt'  # items
    with open(bid_filename, 'r') as f:
        bids = [int(line.strip()) for line in f]
    bid2ind = {bid: ind for ind, bid in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)

    upb_filename = dir_ + 'uid_pos_bid.txt'  # positive rating
    upb = np.loadtxt(upb_filename, dtype=int)

    # generate users items adjacency matrix (the transpose is not needed here)
    adj_ub, _ = generate_adj_mat(upb, uid2ind, bid2ind)

    # generate items object adjacency matrix (cat, state, city, star)
    adj_bo, adj_bo_t = get_bo(path_str, bid2ind)

    t1 = time.time()
    # compute u-> b -> o(cat,city) <- b
    comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t)

    t2 = time.time()
    # '%.2f' (two decimals); the previous '%2.f' rounded to whole seconds.
    print('cal res of %s cost %.2f seconds' % (path_str, t2 - t1))
    density = comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]
    print('comm_res shape=%s,densit=%s' % (comm_res.shape, density))

    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K)

    save_triplets(wfilename, triplets)
    t3 = time.time()
    print('save res of %s cost %.2f seconds' % (path_str, t3 - t2))
コード例 #2
0
def output_result(f_output, model, datas, instanse_id, frame_wise=False):
    """Predict phone sequences for ``datas`` and write them to a CSV file.

    f_output: path of the CSV to create (header ``id,phone_sequence``).
    model: object whose ``predict(datas)`` returns one label row per sample.
    datas: iterable of samples, each an iterable of frame vectors; the first
        all-zero frame marks the end of the real sequence.
    instanse_id: ids written in the first CSV column, indexed like ``datas``.
    frame_wise: when False, leading/trailing ``sil`` characters are stripped
        and runs of a repeated alphanumeric character are collapsed.
    """
    import re

    print('predict size:{}'.format(len(datas)))

    # Effective length of every sample = number of frames before the first
    # all-zero frame (or all frames when there is none).
    sents_len = []
    for sample in datas:
        real_frames = 0
        hit_padding = False
        for frame in sample:
            if all(frame == 0.):
                hit_padding = True
                break
            real_frames += 1
        # An empty sample contributes nothing.
        if hit_padding or real_frames:
            sents_len.append(real_frames)

    # Predict label indices, then map index -> phone -> output character.
    idx2phone = reverse_map(phone2idx)
    to_phone = np.vectorize(idx2phone.get)
    to_char = np.vectorize(phone2char.get)
    preds = to_char(to_phone(model.predict(datas)))

    repeated_char = re.compile(r'([a-zA-Z0-9])\1+')
    print('output:{}'.format(f_output))
    with open(f_output, 'w') as out:
        out.write('id,phone_sequence\n')
        for data_idx, pred in enumerate(preds):
            chars = pred[:sents_len[data_idx]]
            # Blank a single frame that differs from two identical
            # neighbours. The edit is in place and sequential, so a blanked
            # frame is what the next comparison sees.
            last = len(chars) - 1
            pos = 1
            while pos < last:
                before = chars[pos - 1]
                if before == chars[pos + 1] and chars[pos] != before:
                    chars[pos] = ''
                pos += 1
            text = ''.join(chars)
            if not frame_wise:
                text = text.strip(phone2char['sil'])  # trim sil
                text = repeated_char.sub(r'\1', text)  # collapse repeats
            out.write('{},{}\n'.format(instanse_id[data_idx], text))
コード例 #3
0
def load_eids(eid_filename, type_):
    '''
        Load entity ids (one integer per line) and build the index maps.

        eid_filename: path of the id file.
        type_: label that only appears in the progress message.

        Returns (eids, eid2ind, ind2eid): the ids in file order, a dict
        mapping each id to its 0-based line position (the last occurrence
        wins for a repeated id), and ``reverse_map(eid2ind)``.
    '''
    # The context manager closes the file even if a line is not an integer.
    with open(eid_filename, 'r') as f:
        eids = [int(line.strip()) for line in f]
    eid2ind = {eid: ind for ind, eid in enumerate(eids)}
    ind2eid = reverse_map(eid2ind)
    # Single-argument print(...) emits the same bytes on Python 2 and 3.
    print('get %s %s from %s' % (len(eids), type_, eid_filename))
    return eids, eid2ind, ind2eid
コード例 #4
0
def cal_rar(path_str):
    '''
        Build the weighted R-A adjacency matrix, compute R-A-R block-wise
        through ``cal_rar_block`` and pickle the result.

        path_str: selects the input file ('P' -> positive, 'N' -> negative
                  aspect weights; 'P' is checked first) and is embedded in
                  the output filenames.

        Writes ``sim_res/path_count/<path_str>_spa_mat.pickle`` (the matrix)
        and ``sim_res/path_count/<path_str>_spa_mat_id_map.pickle`` (the
        index -> rid map) under ``dir_``.

        Raises ValueError when path_str contains neither 'P' nor 'N'.
    '''
    aid_filename = dir_ + 'aids.txt'
    rid_filename = dir_ + 'rids.txt'

    # Context managers close the id files even when parsing fails.
    with open(aid_filename, 'r') as f:
        aids = [int(line.strip()) for line in f]
    aid2ind = {a: ind for ind, a in enumerate(aids)}  # global ind

    with open(rid_filename, 'r') as f:
        rids = [int(line.strip()) for line in f]
    rid2ind = {r: ind for ind, r in enumerate(rids)}  # global ind
    ind2rid = reverse_map(rid2ind)

    if 'P' in path_str:
        ura_filename = dir_ + 'uid_rid_pos_aid_weight.txt'
    elif 'N' in path_str:
        ura_filename = dir_ + 'uid_rid_neg_aid_weight.txt'
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("path_str must contain 'P' or 'N', got %r" %
                         (path_str,))

    ura = np.loadtxt(ura_filename, dtype=np.float64)
    # Columns 1..3 are (rid, aid, weight); ids are cast back to int.
    ra = [(int(r), int(a), w) for r, a, w in ura[:, (1, 2, 3)]]
    adj_ra, _ = generate_adj_mat(ra, rid2ind, aid2ind, is_weight=True)

    t1 = time.time()
    RA = adj_ra.toarray()
    t2 = time.time()
    print('to dense RA%s cost %.2f seconds' % (RA.shape, t2 - t1))
    RAR_csr = cal_rar_block(RA, len(rid2ind), ind2rid, step=20000)
    print('finish cal rar by blocks, cost %.2f minutes' % (
        (time.time() - t2) / 60.0))

    # Saving stays best-effort: a failure is reported, not raised.
    try:
        wfilename = dir_ + 'sim_res/path_count/%s_spa_mat.pickle' % path_str
        # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
        # binary mode ('w+' corrupts it on Windows and fails on Python 3).
        with open(wfilename, 'wb') as fw:
            pickle.dump(RAR_csr, fw, pickle.HIGHEST_PROTOCOL)
        map_filename = dir_ + 'sim_res/path_count/%s_spa_mat_id_map.pickle' % path_str
        with open(map_filename, 'wb') as fw:
            pickle.dump(ind2rid, fw, pickle.HIGHEST_PROTOCOL)
        print('finish saving sparse mat in  %s' % (wfilename,))
    except Exception as e:
        print(e)
コード例 #5
0
def cal_comm_mat_UBB(path_str):
    '''
        200k ratings
        calculate the commuting matrix in U-B-*-B style
        in fact, only need to calculate BB

        Loads ids and positive (user, item) pairs from ``dir_``, lets
        ``cal_mat_ubb`` combine the user-item and item-object adjacency
        matrices, and stores the K = 500 triplets returned by
        ``get_topK_items`` in ``sim_res/path_count/<path_str>_top500.res``.

        path_str: meta-path name; passed to ``get_bo`` / ``cal_mat_ubb`` and
                  used in the output filename.
    '''
    uid_filename = dir_ + 'uids.txt'
    # Single-argument print(...) emits the same bytes on Python 2 and 3.
    print('run cal_comm_mat_samples for 10k users in  %s' % (uid_filename,))
    uid2ind = {uid: ind for ind, uid in enumerate(_ubb_read_ids(uid_filename))}
    ind2uid = reverse_map(uid2ind)

    bid_filename = dir_ + 'bids.txt'
    bid2ind = {bid: ind for ind, bid in enumerate(_ubb_read_ids(bid_filename))}
    ind2bid = reverse_map(bid2ind)

    upb_filename = dir_ + 'uid_pos_bid.txt'
    upb = np.loadtxt(upb_filename, dtype=int)
    # Only the forward user-item matrix is used below.
    adj_ub, _ = generate_adj_mat(upb, uid2ind, bid2ind)

    adj_bo, adj_bo_t = get_bo(path_str, bid2ind)

    t1 = time.time()
    comm_res = cal_mat_ubb(path_str, adj_ub, adj_bo, adj_bo_t)

    t2 = time.time()
    # '%.2f' (two decimals); the previous '%2.f' rounded to whole seconds.
    print('cal res of %s cost %.2f seconds' % (path_str, t2 - t1))
    density = comm_res.nnz * 1.0 / comm_res.shape[0] / comm_res.shape[1]
    print('comm_res shape=%s,densit=%s' % (comm_res.shape, density))

    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    triplets = get_topK_items(comm_res, ind2uid, ind2bid, topK=K)
    save_triplets(wfilename, triplets)
    t3 = time.time()
    print('save res of %s cost %.2f seconds' % (path_str, t3 - t2))


def _ubb_read_ids(filename):
    '''Return the integers stored one per line in filename, in file order.'''
    # The context manager closes the file even if a line is not an integer.
    with open(filename, 'r') as f:
        return [int(line.strip()) for line in f]
コード例 #6
0
def get_bo(path_str, bid2ind):
    '''
        Build the item-object adjacency matrix for the attribute named in
        path_str (e.g. U-pos-B-Cat-B).

        path_str: meta-path name; the first of 'State', 'Cat', 'City',
                  'Star' that it contains selects the attribute file.
        bid2ind: item id -> row index map, forwarded to generate_adj_mat.

        Returns the pair (adj_bo, adj_bo_t) produced by generate_adj_mat.
        Object ids get column indices in the iteration order of the set of
        distinct ids found in the file.

        Raises ValueError when path_str names none of the attributes.
    '''
    # Order matters: it reproduces the precedence of the original if/elif.
    attribute_files = (
        ('State', 'bid_state.txt'),
        ('Cat', 'bid_cat.txt'),
        ('City', 'bid_city.txt'),
        ('Star', 'bid_stars.txt'),
    )
    for keyword, basename in attribute_files:
        if keyword in path_str:
            sfilename = dir_ + basename
            break
    else:
        # Previously this fell through and failed on an unbound name.
        raise ValueError(
            "path_str must contain one of 'State', 'Cat', 'City', 'Star', "
            "got %r" % (path_str,))

    # Each line holds exactly two whitespace-separated integers: bid, oid.
    with open(sfilename, 'r') as f:
        parts = [line.strip().split() for line in f]
    bos = [(int(b), int(o)) for b, o in parts]
    ond2ind = {v: k for k, v in enumerate(set([o for _, o in bos]))}
    adj_bo, adj_bo_t = generate_adj_mat(bos, bid2ind, ond2ind)
    return adj_bo, adj_bo_t
コード例 #7
0
def run(path_str, comb='', K=10):
    '''
        Factorise a (user, item, value) file with MF and save the latent
        user and item features.

        path_str: meta-path name; 'ratings_only' reads ``ratings.txt``, any
                  other value reads the top-``topK`` similarity file.
        comb: when non-empty, read the file from ``combs/`` instead
              (``<path_str>_<comb>_top<topK>.res``).
        K: latent dimensionality handed to MF.

        Relies on the module-level names ``dir_``, ``topK``, ``logger``,
        ``MF`` and ``reverse_map``. Writes ``*_user.dat`` and ``*_item.dat``
        under ``mf_features/path_count/``; each line is the original id
        followed by the tab-separated feature values.
    '''
    use_topK = path_str not in ['ratings_only']

    # Same precedence as before: comb overrides everything else.
    if comb:
        sim_filename = dir_ + 'sim_res/path_count/combs/%s_%s_top%s.res' % (
            path_str, comb, topK)
    elif use_topK:
        sim_filename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str,
                                                                   topK)
    else:
        sim_filename = dir_ + 'ratings.txt'

    start_time = time.time()
    data = np.loadtxt(sim_filename)
    uids = set(data[:, 0].flatten())
    bids = set(data[:, 1].flatten())
    uid2ind = {int(v): k for k, v in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)
    bid2ind = {int(v): k for k, v in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)

    # Replace raw ids by their dense indices in place.
    data[:, 0] = [uid2ind[int(r)] for r in data[:, 0]]
    data[:, 1] = [bid2ind[int(r)] for r in data[:, 1]]

    # Single-argument print(...) emits the same bytes on Python 2 and 3.
    print('finish load data from %s, cost %.2f seconds, users: %s, items=%s' % (
        sim_filename, time.time() - start_time, len(uids), len(bids)))

    eps, lamb, iters = 10, 10, 500
    print('start generate mf features, (K, eps, reg, iters) = (%s, %s, %s, %s)' % (
        K, eps, lamb, iters))
    mf = MF(data=data,
            train_data=data,
            test_data=[],
            K=K,
            eps=eps,
            lamb=lamb,
            max_iter=iters,
            call_logger=logger)
    U, V = mf.run()

    def _save_features(features, ind2id, wfilename):
        # One line per entity: original id, then its feature values.
        rows = []
        for ind, fs in enumerate(features):
            row = [ind2id[ind]]
            row.extend(fs.flatten())
            rows.append('\t'.join([str(t) for t in row]))
        # The context manager closes the file even if the write fails.
        with open(wfilename, 'w+') as fw:
            fw.write('\n'.join(rows))

    start_time = time.time()
    # NOTE(review): rank_dir is created for K != 10 but nothing below writes
    # into it; kept because callers may rely on the directory existing.
    rank_dir = dir_ + 'mf_features/path_count/ranks/%s/' % K
    if K != 10 and not os.path.isdir(rank_dir):
        os.makedirs(rank_dir)

    if use_topK:
        wfilename = dir_ + 'mf_features/path_count/%s_top%s_user.dat' % (
            path_str, topK)
    else:
        wfilename = dir_ + 'mf_features/path_count/%s_user.dat' % (path_str)
    _save_features(U, ind2uid, wfilename)
    print('User-Features: %s saved in %s, cost %.2f seconds' % (
        U.shape, wfilename, time.time() - start_time))

    start_time = time.time()
    if use_topK:
        wfilename = dir_ + 'mf_features/path_count/%s_top%s_item.dat' % (
            path_str, topK)
    else:
        wfilename = dir_ + 'mf_features/path_count/%s_item.dat' % (path_str)
    _save_features(V, ind2bid, wfilename)
    print('Item-Features: %s  saved in %s, cost %.2f seconds' % (
        V.shape, wfilename, time.time() - start_time))
コード例 #8
0
def cal_comm_mat_USUB(path_str):
    '''
        Given meta_structure_str, generate the commuting matrix
        e.g. 'user-review-business,t10_aspect-review-user'

        Computes U-R-(RBR * RAR)-R-U-B, where RBR is built from the
        review-item pairs and RAR is loaded from the pickle written for the
        same path_str, then hands the result to batch_save_comm_res.

        path_str: 'P' selects the positive review files, otherwise 'N' the
                  negative ones; also embedded in the pickle and output
                  filenames.

        Raises ValueError when path_str contains neither 'P' nor 'N'.
    '''

    def _report(label, mat, since):
        # Log shape, density (nnz / cells) and elapsed time of a sparse matrix.
        density = mat.nnz * 1.0 / mat.shape[0] / mat.shape[1]
        print('%s(%s), density=%.5f cost %.2f seconds' % (
            label, mat.shape, density, time.time() - since))

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    aid_filename = dir_ + 'aids.txt'
    upb_filename = dir_ + 'uid_pos_bid.txt'

    # Single-argument print(...) emits the same bytes on Python 2 and 3.
    print('cal commut mat for %s, filenames: %s, %s, %s' % (
        path_str, uid_filename, bid_filename, upb_filename))
    uids, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    bids, bid2ind, ind2bid = load_eids(bid_filename, 'biz')
    aids, aid2ind, ind2aid = load_eids(aid_filename, 'aspect')

    upb = np.loadtxt(upb_filename, dtype=np.int64)
    adj_upb, adj_upb_t = generate_adj_mat(upb, uid2ind, bid2ind)

    if 'P' in path_str:
        urb_filename = dir_ + 'uid_rid_pos_bid.txt'
        ura_filename = dir_ + 'uid_rid_pos_aid.txt'
    elif 'N' in path_str:
        urb_filename = dir_ + 'uid_rid_neg_bid.txt'
        ura_filename = dir_ + 'uid_rid_neg_aid.txt'
    else:
        # Previously this fell through and failed on an unbound name.
        raise ValueError("path_str must contain 'P' or 'N', got %r" %
                         (path_str,))
    # Both branches used the same pickle names, so they are built once.
    ind2rid_filename = dir_ + 'sim_res/path_count/%s_spa_mat_id_map.pickle' % path_str
    rar_mat_filename = dir_ + 'sim_res/path_count/%s_spa_mat.pickle' % path_str

    # Pickles are binary: read them with 'rb' and close the handle.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load pickles produced by this pipeline.
    with open(ind2rid_filename, 'rb') as f:
        ind2rid = pickle.load(f)
    rid2ind = reverse_map(ind2rid)

    urb = np.loadtxt(urb_filename, dtype=np.int64)
    ura = np.loadtxt(ura_filename, dtype=np.int64)

    ur = urb[:, (0, 1)]
    adj_ur, adj_ur_t = generate_adj_mat(ur, uid2ind, rid2ind)

    rb = urb[:, (1, 2)]
    adj_rb, adj_rb_t = generate_adj_mat(rb, rid2ind, bid2ind)

    # NOTE(review): adj_ra is not used below because RAR comes from the
    # pickle; the call is kept in case generate_adj_mat is relied on to
    # check the ids. Confirm and drop if it is not.
    ra = ura[:, (1, 2)]
    adj_ra, adj_ra_t = generate_adj_mat(ra, rid2ind, aid2ind)

    start = time.time()
    RBR = adj_rb.dot(adj_rb_t)
    _report('RBR', RBR, start)

    start = time.time()
    with open(rar_mat_filename, 'rb') as f:
        RAR = pickle.load(f)
    _report('load RAR', RAR, start)

    start = time.time()
    RSR = RBR.multiply(RAR)
    _report('RSR', RSR, start)

    start = time.time()
    URSR = adj_ur.dot(RSR)
    _report('URSR', URSR, start)

    start = time.time()
    URSRU = URSR.dot(adj_ur_t)
    _report('URSRU', URSRU, start)

    start = time.time()
    URSRUB = URSRU.dot(adj_upb)
    _report('URSRUB', URSRUB, start)

    start = time.time()
    K = 500
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    batch_save_comm_res(path_str, wfilename, URSRUB, ind2uid, ind2bid)
    print('finish saving %s %s entries in %s, cost %.2f seconds' % (
        URSRUB.nnz, path_str, wfilename, time.time() - start))
コード例 #9
0
def cal_comm_mat_UUB(path_str, cikm=False):
    '''
        calculate commuting matrix for U-*-U-pos-B style

        Builds a user-user matrix (called UBU throughout, whatever the
        intermediate node type) for the requested meta-path, multiplies it
        by the positive user-item matrix and saves the K = 500 triplets
        returned by get_topK_items to
        ``sim_res/path_count/<path_str>_top500.res``.

        path_str: one of 'UPBUB', 'UNBUB', 'UPBCatBUB', 'UPBCityBUB',
                  'UNBCatBUB', 'UNBCityBUB', 'UUB', 'UCompUB', 'URPARUB',
                  'URNARUB'.
        cikm: when True the review/aspect id files are not loaded, so the
              'URPARUB' / 'URNARUB' paths are unavailable.

        Raises ValueError for an unsupported path_str, or for a
        review/aspect path requested with cikm=True.
    '''
    # Single-argument print(...) emits the same bytes on Python 2 and 3.
    print('path str: %s' % (path_str,))

    uid_filename = dir_ + 'uids.txt'
    bid_filename = dir_ + 'bids.txt'
    upb_filename = dir_ + 'uid_pos_bid.txt'

    print('cal commut mat for %s, filenames: %s, %s, %s' % (
        path_str, uid_filename, bid_filename, upb_filename))
    uids, uid2ind, ind2uid = load_eids(uid_filename, 'user')
    bids, bid2ind, ind2bid = load_eids(bid_filename, 'biz')
    if not cikm:
        rids, rid2ind, ind2rid = load_eids(dir_ + 'rids.txt', 'review')
        aids, aid2ind, ind2aid = load_eids(dir_ + 'aids.txt', 'aspect')

    upb = np.loadtxt(upb_filename, dtype=np.int64)
    adj_upb, adj_upb_t = generate_adj_mat(upb, uid2ind, bid2ind)

    direct_paths = ('UPBUB', 'UNBUB')
    attribute_paths = ('UPBCatBUB', 'UPBCityBUB', 'UNBCatBUB', 'UNBCityBUB')
    review_files = {'URPARUB': 'uid_rid_pos_aid.txt',
                    'URNARUB': 'uid_rid_neg_aid.txt'}

    if path_str in direct_paths or path_str in attribute_paths:
        # 'UN...' paths start from the negative ratings, 'UP...' paths
        # reuse the positive matrix loaded above.
        if path_str.startswith('UN'):
            unb = np.loadtxt(dir_ + 'uid_neg_bid.txt', dtype=np.int64)
            adj_ub, adj_ub_t = generate_adj_mat(unb, uid2ind, bid2ind)
        else:
            adj_ub, adj_ub_t = adj_upb, adj_upb_t

        start = time.time()
        if path_str in direct_paths:
            UBU = adj_ub.dot(adj_ub_t)
        else:
            adj_bo, _ = get_bo(path_str, bid2ind)
            UBO = adj_ub.dot(adj_bo)
            UBU = UBO.dot(UBO.transpose())

    elif path_str == 'UUB':
        uu = np.loadtxt(dir_ + 'user_social.txt', dtype=np.int64)
        adj_uu, _ = generate_adj_mat(uu, uid2ind, uid2ind)

        start = time.time()
        UBU = adj_uu.copy()

    elif path_str == 'UCompUB':
        uc = np.loadtxt(dir_ + 'uid_comp.txt', dtype=np.int64)
        cids = set(uc[:, 1])
        cid2ind = {v: k for k, v in enumerate(cids)}
        adj_uc, adj_uc_t = generate_adj_mat(uc, uid2ind, cid2ind)

        start = time.time()
        UBU = adj_uc.dot(adj_uc_t)

    elif path_str in review_files:
        if cikm:
            # rid2ind / aid2ind are only loaded when cikm is False.
            raise ValueError('path_str %r needs the review/aspect ids, '
                             'which are not loaded when cikm=True' %
                             (path_str,))
        ura = np.loadtxt(dir_ + review_files[path_str])
        # u, r multiple aspects, thus u-r can be duplicate
        ur = list(set([(u, r) for u, r in ura[:, (0, 1)]]))
        adj_ur, _ = generate_adj_mat(ur, uid2ind, rid2ind)
        ra = ura[:, (1, 2)]
        adj_ra, _ = generate_adj_mat(ra, rid2ind, aid2ind)

        start = time.time()
        URA = adj_ur.dot(adj_ra)
        # it should be URARU, here we use UBU for convenience
        UBU = URA.dot(URA.transpose())

    else:
        # Previously this fell through and failed on an unbound UBU.
        raise ValueError('unsupported path_str: %r' % (path_str,))

    # Every branch ended with this same message, so it is emitted once here.
    print('UBU(%s), density=%.5f cost %.2f seconds' % (
        UBU.shape, UBU.nnz * 1.0 / UBU.shape[0] / UBU.shape[1],
        time.time() - start))

    start = time.time()
    UBUB = UBU.dot(adj_upb)
    print('UBUB(%s), density=%.5f cost %.2f seconds' % (
        UBUB.shape, UBUB.nnz * 1.0 / UBUB.shape[0] / UBUB.shape[1],
        time.time() - start))

    start = time.time()
    K = 500
    triplets = get_topK_items(UBUB, ind2uid, ind2bid, topK=K)
    wfilename = dir_ + 'sim_res/path_count/%s_top%s.res' % (path_str, K)
    save_triplets(wfilename, triplets)
    print('finish saving %s %s entries in %s, cost %.2f seconds' % (
        len(triplets), path_str, wfilename, time.time() - start))
コード例 #10
0
def run(path_str, K=10):
    """Factorise a (user, item, value) file with MF and save the features.

    path_str: meta-path name; 'ratings_only' reads ``tuples/ratings.txt``,
        any other value reads the top-``topK`` similarity file for that path.
    K: latent dimensionality handed to MF.

    Relies on the module-level names ``data_dir``, ``topK``, ``logger``,
    ``MF`` and ``reverse_map``. Writes ``*_user.dat`` and ``*_item.dat``
    under ``mf_features/path_count/``; each line is the original id followed
    by the tab-separated feature values.
    """
    use_topK = path_str not in ['ratings_only']

    if use_topK:
        sim_filename = os.path.join(
            data_dir, 'sim_res/path_count/%s_top%s.res' % (path_str, topK))
    else:
        sim_filename = os.path.join(data_dir, 'tuples/ratings.txt')

    start_time = time.time()
    # The builtin str replaces the np.str alias (removed in NumPy 1.24).
    data = np.loadtxt(sim_filename, dtype=str, delimiter="\t")
    uids = set(data[:, 0].flatten())
    bids = set(data[:, 1].flatten())
    uid2ind = {int(v): k for k, v in enumerate(uids)}
    ind2uid = reverse_map(uid2ind)
    bid2ind = {int(v): k for k, v in enumerate(bids)}
    ind2bid = reverse_map(bid2ind)

    # Replace raw ids by their dense indices (stored back as strings until
    # the float conversion below).
    data[:, 0] = [uid2ind[int(r)] for r in data[:, 0]]
    data[:, 1] = [bid2ind[int(r)] for r in data[:, 1]]

    print('finish load data from %s, cost %.2f seconds, users: %s, items=%s' %
          (sim_filename, time.time() - start_time, len(uids), len(bids)))
    # must convert data type to float; the builtin float replaces the
    # np.float alias (removed in NumPy 1.24) and is still float64.
    data = data.astype(dtype=float)
    print("data shape: ", data.shape, data.dtype)

    eps, lamb, iters = 10, 10, 500
    print(
        'start generate mf features, (K, eps, reg, iters) = (%s, %s, %s, %s)' %
        (K, eps, lamb, iters))
    mf = MF(data=data,
            train_data=data,
            test_data=[],
            K=K,
            eps=eps,
            lamb=lamb,
            max_iter=iters,
            call_logger=logger)
    U, V = mf.run()

    def _feature_path(kind):
        # kind is 'user' or 'item'; the top-K runs carry topK in the name.
        if use_topK:
            basename = '%s_top%s_%s.dat' % (path_str, topK, kind)
        else:
            basename = '%s_%s.dat' % (path_str, kind)
        return os.path.join(data_dir, 'mf_features/path_count/' + basename)

    def _save_features(features, ind2id, wfilename):
        # One line per entity: original id, then its feature values.
        rows = []
        for ind, fs in enumerate(features):
            row = [ind2id[ind]]
            row.extend(fs.flatten())
            rows.append('\t'.join([str(t) for t in row]))
        # The context manager closes the file even if the write fails.
        with open(wfilename, 'w+') as fw:
            fw.write('\n'.join(rows))

    start_time = time.time()
    wfilename = _feature_path('user')
    _save_features(U, ind2uid, wfilename)
    print('User-Features: %s saved in %s, cost %.2f seconds' %
          (U.shape, wfilename, time.time() - start_time))

    start_time = time.time()
    wfilename = _feature_path('item')
    _save_features(V, ind2bid, wfilename)
    print('Item-Features: %s  saved in %s, cost %.2f seconds' %
          (V.shape, wfilename, time.time() - start_time))