예제 #1
0
def compute_cfss():
    '''
    Build the shop-shop similarity matrix and save it as cfss.kv.

    Input:
        shop_actu.kv, loaded through KVEngine. Every key matching
        S<digits>_ACTU holds one shop's vector; according to the original
        author's notes its components are the actions users performed on
        that shop.
    Process:
        Each vector keeps only its 20 largest components and is passed to
        normalize() (the return value is ignored, so it is presumably an
        in-place operation). Every unordered pair of shops is then scored
        with norm_dot_product(); pairs whose absolute score is below 1e-5
        are dropped, the rest are recorded in both directions.
    Output:
        The nested dict {shop: {other_shop: similarity}} is handed to
        write_kv_dict() with key template 'S%s_CFSIMS' and target 'cfss.kv'.
    '''
    engine = KVEngine()
    engine.load([full_path('shop_actu.kv')])

    # One truncated, normalised vector per shop.
    shop_users = {}
    for skey in engine.keymatch(r'S\d+_ACTU'):
        sid = key_id(skey)
        # Entries with an empty key or value are skipped before conversion.
        weights = dict((int(key), float(value))
                       for (key, value) in engine.getd(skey).items()
                       if key and value)
        # Keep only the 20 heaviest components.
        heaviest = sorted(weights.items(), key=lambda pair: pair[1],
                          reverse=True)[:20]
        vector = dict(heaviest)
        normalize(vector)
        shop_users[sid] = vector

    # Score every unordered pair of shops exactly once.
    shop_similarity = {}
    sids = sorted(shop_users.keys())
    total = len(sids)
    print("Calculating shop-shop similarity matrix, total %d..." % total)
    for i, sid_a in enumerate(sids):
        if i % 1000 == 0:
            # Progress marker, flushed so it shows up during long runs.
            print("%d" % i)
            sys.stdout.flush()
        for sid_b in sids[i + 1:]:
            sim = norm_dot_product(shop_users[sid_a], shop_users[sid_b])
            if abs(sim) < 1e-5:
                continue
            # The relation is symmetric, so store it under both shops.
            shop_similarity.setdefault(sid_a, {})[sid_b] = sim
            shop_similarity.setdefault(sid_b, {})[sid_a] = sim

    # Persist the matrix as a kv file.
    write_kv_dict(shop_similarity, 'S%s_CFSIMS', 'cfss.kv')
예제 #2
0
def compute_cfgg():
    '''
    Build the goods-goods similarity matrix and save it as cfgg.kv.

    Input:
        goods_actu.kv, loaded through KVEngine (the original notes say it is
        derived from user_actg.kv). Every key matching G<digits>_ACTU holds
        one goods item's vector; according to the original author's notes
        its components are the actions users performed on that item.
    Process:
        Each vector keeps only its 20 largest components and is passed to
        normalize() (the return value is ignored, so it is presumably an
        in-place operation). Every unordered pair of goods is then scored
        with norm_dot_product(); pairs whose absolute score is below 1e-5
        are dropped, the rest are recorded in both directions.
    Output:
        The nested dict {goods: {other_goods: similarity}} is handed to
        write_kv_dict() with key template 'G%s_CFSIMG' and target 'cfgg.kv'.
    '''
    engine = KVEngine()
    engine.load([full_path('goods_actu.kv')])

    # One truncated, normalised vector per goods item.
    goods_users = {}
    for gkey in engine.keymatch(r'G\d+_ACTU'):
        gid = key_id(gkey)
        # Entries with an empty key or value are skipped before conversion.
        weights = dict((int(key), float(value))
                       for (key, value) in engine.getd(gkey).items()
                       if key and value)
        # Keep only the 20 heaviest components.
        heaviest = sorted(weights.items(), key=lambda pair: pair[1],
                          reverse=True)[:20]
        vector = dict(heaviest)
        normalize(vector)
        goods_users[gid] = vector

    # Score every unordered pair of goods exactly once.
    goods_similarity = {}
    gids = sorted(goods_users.keys())
    total = len(gids)
    print("Calculating goods-goods similarity matrix, total %d..." % total)
    for i, gid_a in enumerate(gids):
        if i % 100 == 0:
            # Progress marker, flushed so it shows up during long runs.
            print("%d" % i)
            sys.stdout.flush()
        for gid_b in gids[i + 1:]:
            sim = norm_dot_product(goods_users[gid_a], goods_users[gid_b])
            if abs(sim) < 1e-5:
                continue
            # The relation is symmetric, so store it under both items.
            goods_similarity.setdefault(gid_a, {})[gid_b] = sim
            goods_similarity.setdefault(gid_b, {})[gid_a] = sim

    # Persist the matrix as a kv file.
    write_kv_dict(goods_similarity, 'G%s_CFSIMG', 'cfgg.kv')
예제 #3
0
def compute_cfss():
    '''
    Build the shop-shop similarity matrix and save it as cfss.kv.

    Input:
        shop_actu.kv, loaded through KVEngine. Every key matching
        S<digits>_ACTU holds one shop's vector; according to the original
        author's notes its components are the actions users performed on
        that shop.
    Process:
        Each vector keeps only its 20 largest components and is passed to
        normalize() (the return value is ignored, so it is presumably an
        in-place operation). Every unordered pair of shops is then scored
        with norm_dot_product(); pairs whose absolute score is below 1e-5
        are dropped, the rest are recorded in both directions.
    Output:
        The nested dict {shop: {other_shop: similarity}} is handed to
        write_kv_dict() with key template 'S%s_CFSIMS' and target 'cfss.kv'.
    '''
    store = KVEngine()
    store.load([full_path('shop_actu.kv')])

    # One truncated, normalised vector per shop.
    shop_users = {}
    for skey in store.keymatch(r'S\d+_ACTU'):
        sid = key_id(skey)
        # Entries with an empty key or value are skipped before conversion.
        components = dict((int(key), float(value))
                          for (key, value) in store.getd(skey).items()
                          if key and value)
        # Keep only the 20 heaviest components.
        top20 = sorted(components.items(), key=lambda pair: pair[1],
                       reverse=True)[:20]
        vector = dict(top20)
        normalize(vector)
        shop_users[sid] = vector

    # Score every unordered pair of shops exactly once.
    shop_similarity = {}
    sids = sorted(shop_users.keys())
    count = len(sids)
    print("Calculating shop-shop similarity matrix, total %d..." % count)
    for pos, first in enumerate(sids):
        if pos % 1000 == 0:
            # Progress marker, flushed so it shows up during long runs.
            print("%d" % pos)
            sys.stdout.flush()
        for second in sids[pos + 1:]:
            sim = norm_dot_product(shop_users[first], shop_users[second])
            if abs(sim) < 1e-5:
                continue
            # The relation is symmetric, so store it under both shops.
            shop_similarity.setdefault(first, {})[second] = sim
            shop_similarity.setdefault(second, {})[first] = sim

    # Persist the matrix as a kv file.
    write_kv_dict(shop_similarity, 'S%s_CFSIMS', 'cfss.kv')
예제 #4
0
def compute_cfgg():
    '''
    Build the goods-goods similarity matrix and save it as cfgg.kv.

    Input:
        goods_actu.kv, loaded through KVEngine (the original notes say it is
        derived from user_actg.kv). Every key matching G<digits>_ACTU holds
        one goods item's vector; according to the original author's notes
        its components are the actions users performed on that item.
    Process:
        Each vector keeps only its 20 largest components and is passed to
        normalize() (the return value is ignored, so it is presumably an
        in-place operation). Every unordered pair of goods is then scored
        with norm_dot_product(); pairs whose absolute score is below 1e-5
        are dropped, the rest are recorded in both directions.
    Output:
        The nested dict {goods: {other_goods: similarity}} is handed to
        write_kv_dict() with key template 'G%s_CFSIMG' and target 'cfgg.kv'.
    '''
    store = KVEngine()
    store.load([full_path('goods_actu.kv')])

    # One truncated, normalised vector per goods item.
    goods_users = {}
    for gkey in store.keymatch(r'G\d+_ACTU'):
        gid = key_id(gkey)
        # Entries with an empty key or value are skipped before conversion.
        components = dict((int(key), float(value))
                          for (key, value) in store.getd(gkey).items()
                          if key and value)
        # Keep only the 20 heaviest components.
        top20 = sorted(components.items(), key=lambda pair: pair[1],
                       reverse=True)[:20]
        vector = dict(top20)
        normalize(vector)
        goods_users[gid] = vector

    # Score every unordered pair of goods exactly once.
    goods_similarity = {}
    gids = sorted(goods_users.keys())
    count = len(gids)
    print("Calculating goods-goods similarity matrix, total %d..." % count)
    for pos, first in enumerate(gids):
        if pos % 100 == 0:
            # Progress marker, flushed so it shows up during long runs.
            print("%d" % pos)
            sys.stdout.flush()
        for second in gids[pos + 1:]:
            sim = norm_dot_product(goods_users[first], goods_users[second])
            if abs(sim) < 1e-5:
                continue
            # The relation is symmetric, so store it under both items.
            goods_similarity.setdefault(first, {})[second] = sim
            goods_similarity.setdefault(second, {})[first] = sim

    # Persist the matrix as a kv file.
    write_kv_dict(goods_similarity, 'G%s_CFSIMG', 'cfgg.kv')
예제 #5
0
def worker(tasks, foutput, lock, groups):
    '''
    Find the most similar rows for every task row and write them to foutput.

    For each row id in tasks, the candidate neighbours are the rows that
    share at least one column id with it (looked up through the module-level
    cid2rids index) and at least one group with it (per groups). Candidates
    are scored against the task row with norm_dot_product() using the
    module-level rows table. Scores >= 0.1 are kept, best first, at most 40
    per row, and emitted as one line:

        <rid> <rrid>:<score> <rrid>:<score> ...

    Rows without any same-group candidate produce no line at all.

    Args:
        tasks: iterable of row ids to process.
        foutput: file-like object receiving the result lines (presumably
            shared with other workers, hence the lock -- confirm with the
            caller).
        lock: object exposing acquire()/release(); held while writing to
            foutput.
        groups: mapping of row id -> set of groups. Ids missing from the
            mapping are treated as having no group, so they never match.

    Relies on the module-level names rows (row id -> mapping keyed by column
    id) and cid2rids (column id -> iterable of row ids), which are defined
    outside this function.
    '''
    global rows, cid2rids
    buf = StringIO()

    def flush():
        # Nothing buffered: do not contend for the lock just to write ''.
        if buf.tell() == 0:
            return
        lock.acquire()
        try:
            foutput.write(buf.getvalue())
            foutput.flush()
        finally:
            # Always release, otherwise one failed write would block every
            # other holder of the lock forever.
            lock.release()
        # Rewind before truncating: io.StringIO keeps the old position after
        # truncate(0) and would pad the next write with NUL characters.
        buf.seek(0)
        buf.truncate(0)

    for no, rid in enumerate(tasks):
        # Hand results over in batches of 100 tasks to limit lock traffic.
        if no % 100 == 0:
            flush()

        # Every row sharing at least one column with rid is a candidate.
        rrids = set()
        for cid in rows[rid].keys():
            rrids.update(cid2rids[cid])
        # discard, not remove: rid is absent when the row has no columns or
        # the index does not list it, and that must not abort the worker.
        rrids.discard(rid)

        # Drop candidates that share no group with rid.
        rgroups = groups.get(rid, set())
        small_rrids = set(rrid for rrid in rrids
                          if groups.get(rrid, set()).intersection(rgroups))
        #print >> sys.stderr, 'rrids: %d -> %d' % (len(rrids), len(small_rrids))
        if not small_rrids:
            continue

        rrid2simi = {}
        for rrid in small_rrids:
            rrid2simi[rrid] = norm_dot_product(rows[rid], rows[rrid])
        # Best first; keep scores >= 0.1 and at most 40 neighbours.
        items = sorted(rrid2simi.items(), key=lambda x: x[1], reverse=True)
        items = [item for item in items if item[1] >= 0.1][:40]
        buf.write('%d %s\n' %
                  (rid, ' '.join(['%d:%.4f' % item for item in items])))

    # Write whatever is left from the last, partial batch.
    flush()
예제 #6
0
def worker(tasks, foutput, lock, groups):
    '''
    Find the most similar rows for every task row and write them to foutput.

    For each row id in tasks, the candidate neighbours are the rows that
    share at least one column id with it (looked up through the module-level
    cid2rids index) and at least one group with it (per groups). Candidates
    are scored against the task row with norm_dot_product() using the
    module-level rows table. Scores >= 0.1 are kept, best first, at most 40
    per row, and emitted as one line:

        <rid> <rrid>:<score> <rrid>:<score> ...

    Rows without any same-group candidate produce no line at all.

    Args:
        tasks: iterable of row ids to process.
        foutput: file-like object receiving the result lines (presumably
            shared with other workers, hence the lock -- confirm with the
            caller).
        lock: object exposing acquire()/release(); held while writing to
            foutput.
        groups: mapping of row id -> set of groups. Ids missing from the
            mapping are treated as having no group, so they never match.

    Relies on the module-level names rows (row id -> mapping keyed by column
    id) and cid2rids (column id -> iterable of row ids), which are defined
    outside this function.
    '''
    global rows, cid2rids
    buf = StringIO()

    def flush():
        # Nothing buffered: do not contend for the lock just to write ''.
        if buf.tell() == 0:
            return
        lock.acquire()
        try:
            foutput.write(buf.getvalue())
            foutput.flush()
        finally:
            # Always release, otherwise one failed write would block every
            # other holder of the lock forever.
            lock.release()
        # Rewind before truncating: io.StringIO keeps the old position after
        # truncate(0) and would pad the next write with NUL characters.
        buf.seek(0)
        buf.truncate(0)

    for no, rid in enumerate(tasks):
        # Hand results over in batches of 100 tasks to limit lock traffic.
        if no % 100 == 0:
            flush()

        # Every row sharing at least one column with rid is a candidate.
        rrids = set()
        for cid in rows[rid].keys():
            rrids.update(cid2rids[cid])
        # discard, not remove: rid is absent when the row has no columns or
        # the index does not list it, and that must not abort the worker.
        rrids.discard(rid)

        # Drop candidates that share no group with rid.
        rgroups = groups.get(rid, set())
        small_rrids = set(rrid for rrid in rrids
                          if groups.get(rrid, set()).intersection(rgroups))
        #print >> sys.stderr, 'rrids: %d -> %d' % (len(rrids), len(small_rrids))
        if not small_rrids:
            continue

        rrid2simi = {}
        for rrid in small_rrids:
            rrid2simi[rrid] = norm_dot_product(rows[rid], rows[rrid])
        # Best first; keep scores >= 0.1 and at most 40 neighbours.
        items = sorted(rrid2simi.items(), key=lambda x: x[1], reverse=True)
        items = [item for item in items if item[1] >= 0.1][:40]
        buf.write('%d %s\n' % (rid, ' '.join(['%d:%.4f' % item for item in items])))

    # Write whatever is left from the last, partial batch.
    flush()