Beispiel #1
0
def main():
    """Post-process a goods-similarity file and write the result.

    Command line (exactly five arguments, in this order):
        <goods.simi> <goods.docinfo> <goods.name> <name.me> <goods.simi.output>
    With any other argument count the usage line is printed and the
    process exits with status 0.

    The similarity table returned by read_simi is passed, in order, to
    same_shop_limit, mutually_exclusive_names and price_adj.  Their return
    values are ignored, so they presumably adjust the table in place
    (TODO confirm) before it is handed to output().
    """
    if len(sys.argv) != 6:
        print('Usage: <goods.simi> <goods.docinfo> <goods.name> <name.me> <goods.simi.output>')
        sys.exit(0)

    simi_path, docinfo_path, name_path, me_path, result_path = sys.argv[1:]

    # Every progress line is flushed immediately after it is printed.
    print('docinfo...')
    sys.stdout.flush()
    index_to_id = read_docinfo(docinfo_path)

    print('simi...')
    sys.stdout.flush()
    similar = read_simi(simi_path, index_to_id)

    print('name...')
    sys.stdout.flush()
    names = read_name(name_path)

    print('name.me...')
    sys.stdout.flush()
    exclusions = read_me(me_path)

    engine = KVEngine()
    print('binfo...')
    sys.stdout.flush()
    for kv_file in ('goods_binfo.kv', 'goods_price.kv'):
        engine.load([full_path(kv_file)])

    print('shop limit...')
    sys.stdout.flush()
    same_shop_limit(similar, engine)

    print('mutually_exclusive_names...')
    sys.stdout.flush()
    mutually_exclusive_names(similar, names, exclusions)

    print('price adjustment...')
    sys.stdout.flush()
    price_adj(similar, engine)

    print('output...')
    sys.stdout.flush()
    output(result_path, similar)
Beispiel #2
0
def main():
    """Render a goods-similarity file as an HTML page for manual review.

    Command line (exactly three arguments):
        <goods.simi> <goods.docinfo> <output.html>
    With any other argument count the usage line is printed and the
    process exits with status 0.

    For every goods id in the similarity table one main snippet is
    written, followed by sub snippets for at most its 10 highest-weighted
    neighbours.  The page is assembled in memory and written to
    <output.html> in one go.
    """
    if len(sys.argv) != 4:
        print("Usage: <goods.simi> <goods.docinfo> <output.html>")
        sys.exit(0)

    # NOTE(review): sys.argv[2] (<goods.docinfo>) is required by the
    # argument-count check but is never read in this function.
    goods_simi = read_simi(sys.argv[1])
    kvg = KVEngine()
    kvg.load([full_path("goods_binfo.kv")])
    kvg.load([full_path("goods_price.kv")])
    kvg.load([full_path("goods_cat.kv")])

    html = StringIO()
    html.write('<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>')

    for case, gid in enumerate(goods_simi):
        title, imgurl, taobaourl, shopid, price, category = ginfo(kvg, gid)
        html.write(main_snippet(case, gid, title, imgurl, taobaourl, shopid, price, category))

        # Highest weight first; only the first 10 neighbours are rendered.
        ranked = sorted(goods_simi[gid].items(), key=lambda x: x[1], reverse=True)
        for subcase, (rgid, weight) in enumerate(ranked[:10]):
            # Separate r* names so the main item's fields are not overwritten.
            rtitle, rimgurl, rtaobaourl, rshopid, rprice, rcategory = ginfo(kvg, rgid)
            html.write(sub_snippet(subcase, rgid, rtitle, rimgurl, rtaobaourl, weight,
                                   rshopid, rprice, rcategory))

    # Context manager guarantees the report is flushed and the handle closed.
    with open(sys.argv[3], "w") as out:
        out.write(html.getvalue())
Beispiel #3
0
def compute_cfss():
    """Compute the shop-shop similarity matrix and save it as cfss.kv.

    Input:
        shop_actu.kv, loaded through KVEngine; every key matching
        S<digits>_ACTU is read (per the original notes: the actions users
        performed on a shop).
    Process:
        Each shop's entry becomes an {int: float} vector, cut down to its
        20 largest weights and passed to normalize().  Every unordered
        pair of shops is then scored with norm_dot_product().
    Output:
        Pairs whose score is not below 1e-5 in absolute value are stored
        in both directions and handed to write_kv_dict with the key
        template 'S%s_CFSIMS' and the file name 'cfss.kv'.
    """
    # Original note on the saved layout: shop\tshop:weight;
    kvg = KVEngine()
    kvg.load([full_path('shop_actu.kv')])

    # One truncated, normalized vector per shop.
    shop_users = {}
    for skey in kvg.keymatch(r'S\d+_ACTU'):
        sid = key_id(skey)
        weights = dict([(int(k), float(v))
                        for (k, v) in kvg.getd(skey).items()
                        if k and v])
        # Keep only the 20 heaviest entries.
        strongest = sorted(weights.items(), key=lambda kv: kv[1], reverse=True)[:20]
        weights = dict(strongest)
        # Return value is ignored; presumably rescales the dict in place.
        normalize(weights)
        shop_users[sid] = weights

    # Pairwise scoring over the sorted shop ids (each pair visited once).
    shop_similarity = {}
    sids = sorted(shop_users)
    total = len(sids)
    print("Calculating shop-shop similarity matrix, total %d..." % total)
    for i, sid_a in enumerate(sids):
        if i % 1000 == 0:
            print("%d" % i)
            sys.stdout.flush()
        vec_a = shop_users[sid_a]
        for j in range(i + 1, total):
            sid_b = sids[j]
            sim = norm_dot_product(vec_a, shop_users[sid_b])
            if abs(sim) < 1e-5:
                continue
            # Store the score symmetrically.
            shop_similarity.setdefault(sid_a, {})[sid_b] = sim
            shop_similarity.setdefault(sid_b, {})[sid_a] = sim

    write_kv_dict(shop_similarity, 'S%s_CFSIMS', 'cfss.kv')
Beispiel #4
0
def compute_cfgg():
    """Compute the goods-goods similarity matrix and save it as cfgg.kv.

    Input:
        goods_actu.kv, loaded through KVEngine; every key matching
        G<digits>_ACTU is read (the original notes say it is derived from
        user_actg.kv).
    Process:
        Each goods entry becomes an {int: float} vector, cut down to its
        20 largest weights and passed to normalize().  Every unordered
        pair of goods is then scored with norm_dot_product().
    Output:
        Pairs whose score is not below 1e-5 in absolute value are stored
        in both directions and handed to write_kv_dict with the key
        template 'G%s_CFSIMG' and the file name 'cfgg.kv'.
    """
    kvg = KVEngine()
    kvg.load([full_path('goods_actu.kv')])

    # One truncated, normalized vector per goods id.
    goods_users = {}
    for gkey in kvg.keymatch(r'G\d+_ACTU'):
        gid = key_id(gkey)
        weights = dict([(int(k), float(v))
                        for (k, v) in kvg.getd(gkey).items()
                        if k and v])
        # Keep only the 20 heaviest entries.
        strongest = sorted(weights.items(), key=lambda kv: kv[1], reverse=True)[:20]
        weights = dict(strongest)
        # Return value is ignored; presumably rescales the dict in place.
        normalize(weights)
        goods_users[gid] = weights

    # Pairwise scoring over the sorted goods ids (each pair visited once).
    goods_similarity = {}
    gids = sorted(goods_users)
    total = len(gids)
    print("Calculating goods-goods similarity matrix, total %d..." % total)
    for i, gid_a in enumerate(gids):
        if i % 100 == 0:
            print("%d" % i)
            sys.stdout.flush()
        vec_a = goods_users[gid_a]
        for j in range(i + 1, total):
            gid_b = gids[j]
            sim = norm_dot_product(vec_a, goods_users[gid_b])
            if abs(sim) < 1e-5:
                continue
            # Store the score symmetrically.
            goods_similarity.setdefault(gid_a, {})[gid_b] = sim
            goods_similarity.setdefault(gid_b, {})[gid_a] = sim

    write_kv_dict(goods_similarity, 'G%s_CFSIMG', 'cfgg.kv')
Beispiel #5
0
def main():
    """Run the goods-similarity clean-up pipeline from the command line.

    Arguments (exactly five are required):
        <goods.simi> <goods.docinfo> <goods.name> <name.me> <goods.simi.output>
    With any other argument count the usage line is printed and the
    process exits with status 0.

    Each stage prints its label and flushes stdout before it starts.
    The similarity table is passed through same_shop_limit,
    mutually_exclusive_names and price_adj (return values ignored, so
    presumably modified in place -- TODO confirm) and then to output().
    """
    def stage(label):
        # Print the stage label and flush stdout straight away.
        print(label)
        sys.stdout.flush()

    if len(sys.argv) != 6:
        print('Usage: <goods.simi> <goods.docinfo> <goods.name> <name.me> <goods.simi.output>')
        sys.exit(0)

    args = sys.argv

    stage('docinfo...')
    gidx2id = read_docinfo(args[2])
    stage('simi...')
    gid2simi = read_simi(args[1], gidx2id)
    stage('name...')
    gid2name = read_name(args[3])
    stage('name.me...')
    name_me = read_me(args[4])

    store = KVEngine()
    stage('binfo...')
    store.load([full_path('goods_binfo.kv')])
    store.load([full_path('goods_price.kv')])

    stage('shop limit...')
    same_shop_limit(gid2simi, store)
    stage('mutually_exclusive_names...')
    mutually_exclusive_names(gid2simi, gid2name, name_me)
    stage('price adjustment...')
    price_adj(gid2simi, store)

    stage('output...')
    output(args[5], gid2simi)
def main():
    """Write an HTML spot-check page for a random sample of cfgg results.

    Loads cfgg.kv and goods_binfo.kv, picks up to 50 random keys matching
    G<digits>_CFSIMG and, for each, renders its highest-weighted similar
    goods (at most 10 that have a BINFO entry) with item_snippet.  The
    page is written to 'a.html' in the current directory.
    """
    kvg = KVEngine()
    kvg.load([full_path('cfgg.kv')])
    kvg.load([full_path('goods_binfo.kv')])
    keys = kvg.keymatch(r'G\d+_CFSIMG')

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size for small data sets.
    sample_keys = random.sample(keys, min(50, len(keys)))

    html = StringIO()
    html.write(
        '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>'
    )

    for key in sample_keys:
        similar_goods = int_float_dict(kvg.getd(key))
        ranked = sorted(similar_goods.items(), key=lambda x: x[1], reverse=True)
        # NOTE(review): this lookup uses a hyphen ('G%s-BINFO') while the
        # other key patterns here use an underscore -- confirm the key format.
        items = [item for item in ranked if ('G%s-BINFO' % item[0]) in kvg][:10]
        html.write(item_snippet(kvg, key, items))

    # Context manager guarantees the page is flushed and the handle closed.
    with open('a.html', 'w') as out:
        out.write(html.getvalue())
def main():
    """Dump a random sample of goods-goods similarities to 'a.html'.

    Steps:
        1. Load cfgg.kv and goods_binfo.kv into one KVEngine.
        2. Sample up to 50 keys matching G<digits>_CFSIMG.
        3. For each sampled key, sort its similar goods by weight
           (descending), keep those with a BINFO entry, take the first 10
           and render them with item_snippet.
        4. Write the accumulated HTML to 'a.html'.
    """
    kvg = KVEngine()
    for kv_name in ('cfgg.kv', 'goods_binfo.kv'):
        kvg.load([full_path(kv_name)])

    keys = kvg.keymatch(r'G\d+_CFSIMG')
    # Asking random.sample for more items than exist raises ValueError;
    # fall back to "all keys" when fewer than 50 are available.
    sample_size = min(50, len(keys))
    sample_keys = random.sample(keys, sample_size)

    html = StringIO()
    html.write('<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>')

    for key in sample_keys:
        similar_goods = int_float_dict(kvg.getd(key))
        by_weight = sorted(similar_goods.items(), key=lambda x: x[1], reverse=True)
        # NOTE(review): 'G%s-BINFO' uses a hyphen whereas the key patterns
        # matched elsewhere use an underscore -- verify against the kv files.
        with_info = [item for item in by_weight if ('G%s-BINFO' % item[0]) in kvg]
        html.write(item_snippet(kvg, key, with_info[:10]))

    # Close the file deterministically instead of relying on garbage collection.
    with open('a.html', 'w') as out:
        out.write(html.getvalue())
Beispiel #8
0
def compute_cfss():
    """Build shop-to-shop similarities from shop_actu.kv and save cfss.kv.

    Input:
        shop_actu.kv (keys matching S<digits>_ACTU; per the original notes,
        the actions users performed on each shop).
    Process:
        Turn every shop entry into an {int: float} vector limited to its
        20 largest weights, call normalize() on it, then score every
        unordered pair of shops with norm_dot_product().
    Output:
        Scores that are not below 1e-5 in absolute value, recorded in both
        directions and written via write_kv_dict('S%s_CFSIMS', 'cfss.kv').
    """
    def top_weights(raw):
        # Parse truthy key/value pairs and keep the 20 heaviest ones.
        parsed = dict([(int(k), float(v)) for (k, v) in raw.items() if k and v])
        ranked = sorted(parsed.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ranked[:20])

    # Original note on the saved layout: shop\tshop:weight;
    kvg = KVEngine()
    kvg.load([full_path('shop_actu.kv')])

    shop_users = {}
    for skey in kvg.keymatch(r'S\d+_ACTU'):
        sid = key_id(skey)
        vector = top_weights(kvg.getd(skey))
        # Called for its effect on `vector`; the return value is unused.
        normalize(vector)
        shop_users[sid] = vector

    shop_similarity = {}
    sids = sorted(shop_users.keys())
    count = len(sids)
    print("Calculating shop-shop similarity matrix, total %d..." % count)
    for i in range(count):
        # Progress marker every 1000 shops.
        if i % 1000 == 0:
            print("%d" % i)
            sys.stdout.flush()
        first = sids[i]
        for j in range(i + 1, count):
            second = sids[j]
            sim = norm_dot_product(shop_users[first], shop_users[second])
            if abs(sim) < 1e-5:
                continue
            shop_similarity.setdefault(first, {})[second] = sim
            shop_similarity.setdefault(second, {})[first] = sim

    write_kv_dict(shop_similarity, 'S%s_CFSIMS', 'cfss.kv')
Beispiel #9
0
def compute_cfgg():
    """Build goods-to-goods similarities from goods_actu.kv and save cfgg.kv.

    Input:
        goods_actu.kv (keys matching G<digits>_ACTU; the original notes say
        it is derived from user_actg.kv).
    Process:
        Turn every goods entry into an {int: float} vector limited to its
        20 largest weights, call normalize() on it, then score every
        unordered pair of goods with norm_dot_product().
    Output:
        Scores that are not below 1e-5 in absolute value, recorded in both
        directions and written via write_kv_dict('G%s_CFSIMG', 'cfgg.kv').
    """
    def top_weights(raw):
        # Parse truthy key/value pairs and keep the 20 heaviest ones.
        parsed = dict([(int(k), float(v)) for (k, v) in raw.items() if k and v])
        ranked = sorted(parsed.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ranked[:20])

    kvg = KVEngine()
    kvg.load([full_path('goods_actu.kv')])

    goods_users = {}
    for gkey in kvg.keymatch(r'G\d+_ACTU'):
        gid = key_id(gkey)
        vector = top_weights(kvg.getd(gkey))
        # Called for its effect on `vector`; the return value is unused.
        normalize(vector)
        goods_users[gid] = vector

    goods_similarity = {}
    gids = sorted(goods_users.keys())
    count = len(gids)
    print("Calculating goods-goods similarity matrix, total %d..." % count)
    for i in range(count):
        # Progress marker every 100 goods.
        if i % 100 == 0:
            print("%d" % i)
            sys.stdout.flush()
        first = gids[i]
        for j in range(i + 1, count):
            second = gids[j]
            sim = norm_dot_product(goods_users[first], goods_users[second])
            if abs(sim) < 1e-5:
                continue
            goods_similarity.setdefault(first, {})[second] = sim
            goods_similarity.setdefault(second, {})[first] = sim

    write_kv_dict(goods_similarity, 'G%s_CFSIMG', 'cfgg.kv')
Beispiel #10
0
def main():
    """Generate an HTML review page from a goods-similarity file.

    Usage: <goods.simi> <goods.docinfo> <output.html>
    Any other argument count prints the usage line and exits with
    status 0.

    Every goods id in the similarity table gets one main snippet followed
    by sub snippets for its highest-weighted neighbours (10 at most).
    The finished page is written to <output.html>.
    """
    if len(sys.argv) != 4:
        print('Usage: <goods.simi> <goods.docinfo> <output.html>')
        sys.exit(0)

    # NOTE(review): the <goods.docinfo> argument (sys.argv[2]) is counted
    # by the check above but not used anywhere below.
    simi_path, out_path = sys.argv[1], sys.argv[3]

    goods_simi = read_simi(simi_path)
    kvg = KVEngine()
    kvg.load([full_path('goods_binfo.kv')])
    kvg.load([full_path('goods_price.kv')])
    kvg.load([full_path('goods_cat.kv')])

    html = StringIO()
    html.write(
        '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>'
    )

    for case, gid in enumerate(goods_simi):
        title, imgurl, taobaourl, shopid, price, category = ginfo(kvg, gid)
        html.write(
            main_snippet(case, gid, title, imgurl, taobaourl, shopid, price,
                         category))

        neighbours = sorted(goods_simi[gid].items(),
                            key=lambda x: x[1], reverse=True)
        # Slice instead of counting inside the loop: top 10 by weight.
        for subcase, (rgid, weight) in enumerate(neighbours[:10]):
            # r-prefixed names keep the main item's fields intact.
            (rtitle, rimgurl, rtaobaourl,
             rshopid, rprice, rcategory) = ginfo(kvg, rgid)
            html.write(
                sub_snippet(subcase, rgid, rtitle, rimgurl, rtaobaourl, weight,
                            rshopid, rprice, rcategory))

    # Close the output deterministically so the page is fully flushed.
    with open(out_path, 'w') as out:
        out.write(html.getvalue())
Beispiel #11
0
def compute_cfus():
    '''
    Compute the list of shops recommended to each user.

    Input:
        cfss: shop-shop relations
        user_favu: shops the user follows (the file actually loaded below
            is user_favs.kv)
        user_actu: shops the user has acted on
    Process:
        Start from the shops directly related to a user, collect the shops
        related to those, then filter.
    Output:
        The per-user recommended shop list produced by the CF algorithm,
        cfus.kv

    NOTE(review): this function looks unfinished.  The first half builds
    local tables from the kv files; the second half reads everything from
    `self`, which is not a parameter here, and nothing visible writes
    cfus.kv.  See the inline notes.
    '''
    kvg = KVEngine()
    kvg.load([full_path('cfss.kv')])
    kvg.load([full_path('user_favs.kv')])
    kvg.load([full_path('user_actu.kv')])
    kvg.load([full_path('shop_binfo.kv')])

    # get shop_similarity
    # NOTE(review): keys are selected with the pattern 'S\d+_CFSIMS', so
    # int(key) presumably raises ValueError on the full key string; the
    # blocked-shop loop below uses key_id(key) instead -- confirm.
    keys = kvg.keymatch('S\d+_CFSIMS')
    shop_similarity = dict([(int(key),
                             dict([(int(k), float(v))
                                   for (k, v) in kvg.getd(key).items()]))
                            for key in keys])

    # get user_fav_shops
    # NOTE(review): same int(key) concern as above for 'U\d+_FAVS' keys.
    keys = kvg.keymatch('U\d+_FAVS')
    user_fav_shops = dict([(int(key), set([int(k) for k in kvg.getl(key)]))
                           for key in keys])

    # get blocked shop set: every shop whose 'block' field is not the
    # string '0'
    keys = kvg.keymatch('S\d+_BINFO')
    blocked_shops = set()
    for key in keys:
        if kvg.getk(key, 'block') != '0':
            blocked_shops.add(key_id(key))

    # get user tags by fav shops
    # NOTE(review): bare name expression -- raises NameError unless a
    # module-level `shop_tags` exists; this step appears unimplemented.
    shop_tags

    # get user_shops (not implemented here)

    # shop idf (not implemented here)

    # weighting and normalizing user_shops (not implemented here)

    # Recommend for each user.
    # NOTE(review): from here on the code uses `self.*` although this
    # function takes no `self`; the locals built above (shop_similarity,
    # user_fav_shops, blocked_shops) are not used below.
    print "Recommend for each user, total %d" % len(self.user_shops)
    sys.stdout.flush()
    for no, uid in enumerate(self.user_shops):
        shop_weight = {}  # shops recommended to this user, with weights
        shops = self.user_shops[uid]  # shops the user has acted on
        fav_shops = self.user_fav_shops.get(uid, {})  # shops the user follows
        if no % 1000 == 0:
            print "%d" % no
            sys.stdout.flush()

        # Accumulate, for every similar shop, the user's weight on the
        # source shop times the similarity between the two shops.
        for sid in shops:
            if sid not in self.shop_similarity:
                continue
            simi_shops = self.shop_similarity[sid]
            for ssid in simi_shops:
                if ssid in shop_weight:
                    shop_weight[ssid] += shops[sid] * simi_shops[ssid]
                else:
                    shop_weight[ssid] = shops[sid] * simi_shops[ssid]

        # Filter shop_weight
        shop_weight_new = {}
        for sid in shop_weight:
            # Decide whether shop sid is suitable to recommend to user uid
            if sid in fav_shops:
                continue  # already followed
            if sid in self.shop_info and self.shop_info[sid][2] != 0:
                continue  # shop's block attribute is non-zero: blocked, skip
            if sid in self.shop_tags and uid in self.user_tags and \
                    self._tag_conflict(self.user_tags[uid], self.shop_tags[sid]):
                continue  # shop type conflicts with the shops the user follows
            shop_weight_new[sid] = shop_weight[sid]

        if not shop_weight_new:
            continue  # everything was filtered out for this user; record nothing

        # Sort and keep the top entries
        normalize(shop_weight_new)
        items = shop_weight_new.items()
        items.sort(reverse=True, key=lambda x: x[1])  # sort by weight desc

        self.user_recommend_list[uid] = items[:TOP_SHOP_NUM]  # limit n
Beispiel #12
0
def compute_cfus():
    '''
    Compute the list of shops recommended to each user.

    Input:
        cfss: shop-shop relations
        user_favu: shops the user follows (the file actually loaded below
            is user_favs.kv)
        user_actu: shops the user has acted on
    Process:
        Start from the shops directly related to a user, collect the shops
        related to those, then filter.
    Output:
        The per-user recommended shop list produced by the CF algorithm,
        cfus.kv

    NOTE(review): this function looks unfinished.  The first half builds
    local tables from the kv files; the second half reads everything from
    `self`, which is not a parameter here, and nothing visible writes
    cfus.kv.  See the inline notes.
    '''
    kvg = KVEngine()
    kvg.load([full_path('cfss.kv')])
    kvg.load([full_path('user_favs.kv')])
    kvg.load([full_path('user_actu.kv')])
    kvg.load([full_path('shop_binfo.kv')])

    # get shop_similarity
    # NOTE(review): keys are selected with the pattern 'S\d+_CFSIMS', so
    # int(key) presumably raises ValueError on the full key string; the
    # blocked-shop loop below uses key_id(key) instead -- confirm.
    keys = kvg.keymatch('S\d+_CFSIMS')
    shop_similarity = dict([(int(key), dict([(int(k), float(v)) for (k, v) in kvg.getd(key).items()])) for key in keys])

    # get user_fav_shops
    # NOTE(review): same int(key) concern as above for 'U\d+_FAVS' keys.
    keys = kvg.keymatch('U\d+_FAVS')
    user_fav_shops = dict([(int(key), set([int(k) for k in kvg.getl(key)])) for key in keys])

    # get blocked shop set: every shop whose 'block' field is not the
    # string '0'
    keys = kvg.keymatch('S\d+_BINFO')
    blocked_shops = set()
    for key in keys:
        if kvg.getk(key, 'block') != '0':
            blocked_shops.add(key_id(key))

    # get user tags by fav shops
    # NOTE(review): bare name expression -- raises NameError unless a
    # module-level `shop_tags` exists; this step appears unimplemented.
    shop_tags

    # get user_shops (not implemented here)

    # shop idf (not implemented here)

    # weighting and normalizing user_shops (not implemented here)

    # Recommend for each user.
    # NOTE(review): from here on the code uses `self.*` although this
    # function takes no `self`; the locals built above (shop_similarity,
    # user_fav_shops, blocked_shops) are not used below.
    print "Recommend for each user, total %d" % len(self.user_shops)
    sys.stdout.flush()
    for no, uid in enumerate(self.user_shops):
        shop_weight = {} # shops recommended to this user, with weights
        shops = self.user_shops[uid] # shops the user has acted on
        fav_shops = self.user_fav_shops.get(uid, {}) # shops the user follows
        if no % 1000 == 0:
            print "%d" % no
            sys.stdout.flush()

        # Accumulate, for every similar shop, the user's weight on the
        # source shop times the similarity between the two shops.
        for sid in shops:
            if sid not in self.shop_similarity:
                continue
            simi_shops = self.shop_similarity[sid]
            for ssid in simi_shops:
                if ssid in shop_weight:
                    shop_weight[ssid] += shops[sid] * simi_shops[ssid]
                else:
                    shop_weight[ssid] = shops[sid] * simi_shops[ssid]
        
        # Filter shop_weight
        shop_weight_new = {}
        for sid in shop_weight:
            # Decide whether shop sid is suitable to recommend to user uid
            if sid in fav_shops:
                continue # already followed
            if sid in self.shop_info and self.shop_info[sid][2] != 0:
                continue # shop's block attribute is non-zero: blocked, skip
            if sid in self.shop_tags and uid in self.user_tags and \
                    self._tag_conflict(self.user_tags[uid], self.shop_tags[sid]):
                continue # shop type conflicts with the shops the user follows
            shop_weight_new[sid] = shop_weight[sid]

        if not shop_weight_new:
            continue # everything was filtered out for this user; record nothing

        # Sort and keep the top entries
        normalize(shop_weight_new)
        items = shop_weight_new.items()
        items.sort(reverse=True, key=lambda x: x[1]) # sort by weight desc

        self.user_recommend_list[uid] = items[:TOP_SHOP_NUM] # limit n