def main():
    """Filter a goods similarity table and write the result to a file.

    Expects exactly five command-line arguments:
        <goods.simi> <goods.docinfo> <goods.name> <name.me> <goods.simi.output>
    With any other argument count the usage line is printed and the process
    exits with status 0.

    Every stage prints a progress label to stdout and flushes it before the
    stage runs.  The three filter stages (same_shop_limit,
    mutually_exclusive_names, price_adj) are called only for their effect on
    the similarity table; their return values are ignored.
    """
    if len(sys.argv) != 6:
        print('Usage: <goods.simi> <goods.docinfo> <goods.name> <name.me> <goods.simi.output>')
        sys.exit(0)

    simi_path, docinfo_path, name_path, me_path, out_path = sys.argv[1:]

    # --- load the inputs -------------------------------------------------
    print('docinfo...')
    sys.stdout.flush()
    index_to_id = read_docinfo(docinfo_path)

    print('simi...')
    sys.stdout.flush()
    simi_table = read_simi(simi_path, index_to_id)

    print('name...')
    sys.stdout.flush()
    names = read_name(name_path)

    print('name.me...')
    sys.stdout.flush()
    exclusive_names = read_me(me_path)

    engine = KVEngine()
    print('binfo...')
    sys.stdout.flush()
    engine.load([full_path('goods_binfo.kv')])
    engine.load([full_path('goods_price.kv')])

    # --- filter the similarity table --------------------------------------
    print('shop limit...')
    sys.stdout.flush()
    same_shop_limit(simi_table, engine)

    print('mutually_exclusive_names...')
    sys.stdout.flush()
    mutually_exclusive_names(simi_table, names, exclusive_names)

    print('price adjustment...')
    sys.stdout.flush()
    price_adj(simi_table, engine)

    # --- write the result ---------------------------------------------------
    print('output...')
    sys.stdout.flush()
    output(out_path, simi_table)
def main():
    """Render a goods similarity table as an HTML page for manual review.

    Command line: <goods.simi> <goods.docinfo> <output.html>
    With any other argument count the usage line is printed and the process
    exits with status 0.  Note that the second argument (goods.docinfo) is
    required by the argument-count check but is never read here.

    For every goods id in the similarity table a main snippet is written,
    followed by sub snippets for at most its 10 highest-weighted similar
    goods (highest weight first).  The page is assembled in memory and
    written to the output path in one go.
    """
    if len(sys.argv) != 4:
        print("Usage: <goods.simi> <goods.docinfo> <output.html>")
        sys.exit(0)

    goods_simi = read_simi(sys.argv[1])

    kvg = KVEngine()
    for kv_name in ("goods_binfo.kv", "goods_price.kv", "goods_cat.kv"):
        kvg.load([full_path(kv_name)])

    html = StringIO()
    html.write('<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>')

    for case, gid in enumerate(goods_simi):
        title, imgurl, taobaourl, shopid, price, category = ginfo(kvg, gid)
        html.write(main_snippet(case, gid, title, imgurl, taobaourl,
                                shopid, price, category))

        # Highest weight first; only the top 10 neighbours are shown.
        ranked = sorted(goods_simi[gid].items(),
                        key=lambda pair: pair[1], reverse=True)
        for subcase, (rgid, weight) in enumerate(ranked[:10]):
            rtitle, rimgurl, rtaobaourl, rshopid, rprice, rcategory = ginfo(kvg, rgid)
            html.write(sub_snippet(subcase, rgid, rtitle, rimgurl, rtaobaourl,
                                   weight, rshopid, rprice, rcategory))

    # Close the file deterministically instead of relying on garbage
    # collection to flush and release the handle.
    with open(sys.argv[3], "w") as out:
        out.write(html.getvalue())
def compute_cfss():
    """Compute the shop-shop similarity matrix.

    Input:
        shop_actu.kv: actions users performed on shops.
    Process:
        Represent every shop as a vector of user-action weights and compute
        the dot product between each pair of shop vectors.
    Output:
        Shop-shop similarity, handed to write_kv_dict with the key template
        'S%s_CFSIMS' and the file name 'cfss.kv'.
    """
    def load_vector(raw):
        # Parse one shop's raw mapping, keep its 20 largest weights, and
        # normalize.  normalize() is called for its effect on the dict; its
        # return value is ignored.
        weights = dict((int(uid), float(weight))
                       for uid, weight in raw.items() if uid and weight)
        strongest = sorted(weights.items(),
                           key=lambda pair: pair[1], reverse=True)[:20]
        trimmed = dict(strongest)
        normalize(trimmed)
        return trimmed

    engine = KVEngine()
    engine.load([full_path('shop_actu.kv')])

    # Build the per-shop vectors.
    shop_users = {}
    for shop_key in engine.keymatch(r'S\d+_ACTU'):
        shop_id = key_id(shop_key)
        shop_users[shop_id] = load_vector(engine.getd(shop_key))

    # Pairwise similarity over the sorted shop ids; each unordered pair is
    # evaluated once and stored in both directions.
    shop_similarity = {}
    ordered_ids = sorted(shop_users)
    total = len(ordered_ids)
    print("Calculating shop-shop similarity matrix, total %d..." % total)
    for i in range(total):
        if i % 1000 == 0:
            print("%d" % i)
            sys.stdout.flush()
        left = ordered_ids[i]
        for j in range(i + 1, total):
            right = ordered_ids[j]
            sim = norm_dot_product(shop_users[left], shop_users[right])
            if abs(sim) < 1e-5:
                continue  # treat near-zero similarity as unrelated
            shop_similarity.setdefault(left, {})[right] = sim
            shop_similarity.setdefault(right, {})[left] = sim

    # Save as a kv file.
    write_kv_dict(shop_similarity, 'S%s_CFSIMS', 'cfss.kv')
def compute_cfgg():
    """Compute the goods-goods similarity matrix.

    Input:
        goods_actu.kv: user actions per goods item (the original note
        describes it as derived from user_actg.kv).
    Process:
        Represent every goods item as a vector of user-action weights and
        compute the dot product between each pair of goods vectors.
    Output:
        Goods-goods similarity, handed to write_kv_dict with the key template
        'G%s_CFSIMG' and the file name 'cfgg.kv'.
    """
    def load_vector(raw):
        # Parse one item's raw mapping, keep its 20 largest weights, and
        # normalize.  normalize() is called for its effect on the dict; its
        # return value is ignored.
        weights = dict((int(uid), float(weight))
                       for uid, weight in raw.items() if uid and weight)
        strongest = sorted(weights.items(),
                           key=lambda pair: pair[1], reverse=True)[:20]
        trimmed = dict(strongest)
        normalize(trimmed)
        return trimmed

    engine = KVEngine()
    engine.load([full_path('goods_actu.kv')])

    # Build the per-goods vectors.
    goods_users = {}
    for goods_key in engine.keymatch(r'G\d+_ACTU'):
        goods_id = key_id(goods_key)
        goods_users[goods_id] = load_vector(engine.getd(goods_key))

    # Pairwise similarity over the sorted goods ids; each unordered pair is
    # evaluated once and stored in both directions.
    goods_similarity = {}
    ordered_ids = sorted(goods_users)
    total = len(ordered_ids)
    print("Calculating goods-goods similarity matrix, total %d..." % total)
    for i in range(total):
        if i % 100 == 0:
            print("%d" % i)
            sys.stdout.flush()
        left = ordered_ids[i]
        for j in range(i + 1, total):
            right = ordered_ids[j]
            sim = norm_dot_product(goods_users[left], goods_users[right])
            if abs(sim) < 1e-5:
                continue  # treat near-zero similarity as unrelated
            goods_similarity.setdefault(left, {})[right] = sim
            goods_similarity.setdefault(right, {})[left] = sim

    # Save as a kv file.
    write_kv_dict(goods_similarity, 'G%s_CFSIMG', 'cfgg.kv')
def main():
    """Run the goods-similarity post-processing pipeline.

    Usage: <goods.simi> <goods.docinfo> <goods.name> <name.me> <goods.simi.output>

    When the argument count is wrong the usage line is printed and the
    process exits with status 0.  Otherwise the inputs are read, two kv files
    (goods_binfo.kv, goods_price.kv) are loaded into a KVEngine, the
    similarity table is passed through same_shop_limit,
    mutually_exclusive_names and price_adj (return values ignored), and the
    table is handed to output() together with the last argument.
    """
    def stage(label):
        # Print a progress label and push it out immediately so progress is
        # visible even when stdout is buffered.
        print(label)
        sys.stdout.flush()

    if len(sys.argv) != 6:
        print('Usage: <goods.simi> <goods.docinfo> <goods.name> <name.me> <goods.simi.output>')
        sys.exit(0)

    args = sys.argv

    stage('docinfo...')
    gidx2id = read_docinfo(args[2])

    stage('simi...')
    gid2simi = read_simi(args[1], gidx2id)

    stage('name...')
    gid2name = read_name(args[3])

    stage('name.me...')
    name_me = read_me(args[4])

    kv_store = KVEngine()
    stage('binfo...')
    kv_store.load([full_path('goods_binfo.kv')])
    kv_store.load([full_path('goods_price.kv')])

    stage('shop limit...')
    same_shop_limit(gid2simi, kv_store)

    stage('mutually_exclusive_names...')
    mutually_exclusive_names(gid2simi, gid2name, name_me)

    stage('price adjustment...')
    price_adj(gid2simi, kv_store)

    stage('output...')
    output(args[5], gid2simi)
def main():
    """Write a.html previewing similar goods for a random sample of goods.

    Loads cfgg.kv and goods_binfo.kv, picks up to 50 random keys matching
    G<digits>_CFSIMG, and for each one writes an item snippet listing at most
    its 10 highest-weighted similar goods that have a 'G%s-BINFO' entry in
    the store.  The page is built in memory and written to 'a.html' in the
    current working directory.
    """
    kvg = KVEngine()
    kvg.load([full_path('cfgg.kv')])
    kvg.load([full_path('goods_binfo.kv')])

    keys = kvg.keymatch(r'G\d+_CFSIMG')
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size on small data sets.
    sample_keys = random.sample(keys, min(50, len(keys)))

    html = StringIO()
    html.write(
        '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>'
    )

    for key in sample_keys:
        similar_goods = int_float_dict(kvg.getd(key))
        ranked = sorted(similar_goods.items(),
                        key=lambda pair: pair[1], reverse=True)
        # NOTE(review): this membership test uses a hyphen ('G%s-BINFO')
        # while the key pattern above uses an underscore -- confirm the
        # actual key format in goods_binfo.kv.
        shown = [pair for pair in ranked if ('G%s-BINFO' % pair[0]) in kvg][:10]
        html.write(item_snippet(kvg, key, shown))

    # Close the file deterministically instead of relying on garbage
    # collection to flush and release the handle.
    with open('a.html', 'w') as out:
        out.write(html.getvalue())
def main():
    """Generate a.html showing similar goods for randomly sampled goods.

    Steps:
      1. Load cfgg.kv and goods_binfo.kv into a KVEngine.
      2. Randomly sample up to 50 keys matching G<digits>_CFSIMG.
      3. For each sampled key, rank its similar goods by weight (highest
         first), keep those with a 'G%s-BINFO' entry in the store, and write
         an item snippet for the first 10.
      4. Write the assembled page to 'a.html' in the working directory.
    """
    kvg = KVEngine()
    kvg.load([full_path('cfgg.kv')])
    kvg.load([full_path('goods_binfo.kv')])

    keys = kvg.keymatch(r'G\d+_CFSIMG')
    # Asking random.sample for more items than exist raises ValueError;
    # fall back to sampling everything when fewer than 50 keys are present.
    sample_size = min(50, len(keys))
    sample_keys = random.sample(keys, sample_size)

    html = StringIO()
    html.write('<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>')

    for key in sample_keys:
        similar_goods = int_float_dict(kvg.getd(key))
        by_weight = sorted(similar_goods.items(),
                           key=lambda entry: entry[1], reverse=True)
        # NOTE(review): the lookup key is spelled with a hyphen
        # ('G%s-BINFO') whereas the sampled keys use an underscore --
        # confirm this matches the keys stored in goods_binfo.kv.
        known = [entry for entry in by_weight if ('G%s-BINFO' % entry[0]) in kvg]
        html.write(item_snippet(kvg, key, known[:10]))

    # Use a context manager so the handle is flushed and closed even if the
    # write fails.
    with open('a.html', 'w') as out:
        out.write(html.getvalue())
def compute_cfss():
    """Build the shop-to-shop similarity matrix and save it.

    Input:
        shop_actu.kv -- actions users performed on shops.
    Process:
        Each shop becomes a vector of user-action weights; similarity is the
        dot product between pairs of those vectors.
    Output:
        The similarity mapping is passed to write_kv_dict with key template
        'S%s_CFSIMS' and file name 'cfss.kv'.
    """
    store = KVEngine()
    store.load([full_path('shop_actu.kv')])

    # Collect the per-shop vectors.
    vectors = {}
    for record_key in store.keymatch(r'S\d+_ACTU'):
        shop = key_id(record_key)
        parsed = dict(
            (int(user), float(score))
            for user, score in store.getd(record_key).items()
            if user and score
        )
        # Keep only the 20 strongest entries of each vector.
        ranked = sorted(parsed.items(), key=lambda kv: kv[1], reverse=True)
        vector = dict(ranked[:20])
        normalize(vector)  # return value unused; relied on for its effect on vector
        vectors[shop] = vector

    # Similarity calculation: visit every unordered pair once and record the
    # value under both shops.
    similarity = {}
    shops = sorted(vectors)
    count = len(shops)
    print("Calculating shop-shop similarity matrix, total %d..." % count)
    for a in range(count):
        if a % 1000 == 0:
            print("%d" % a)
            sys.stdout.flush()
        for b in range(a + 1, count):
            first, second = shops[a], shops[b]
            sim = norm_dot_product(vectors[first], vectors[second])
            if abs(sim) < 1e-5:
                continue
            similarity.setdefault(first, {})[second] = sim
            similarity.setdefault(second, {})[first] = sim

    # Save as a kv file.
    write_kv_dict(similarity, 'S%s_CFSIMS', 'cfss.kv')
def compute_cfgg():
    """Build the goods-to-goods similarity matrix and save it.

    Input:
        goods_actu.kv -- user actions per goods item (the original note
        describes it as derived from user_actg.kv).
    Process:
        Each goods item becomes a vector of user-action weights; similarity
        is the dot product between pairs of those vectors.
    Output:
        The similarity mapping is passed to write_kv_dict with key template
        'G%s_CFSIMG' and file name 'cfgg.kv'.
    """
    store = KVEngine()
    store.load([full_path('goods_actu.kv')])

    # Collect the per-goods vectors.
    vectors = {}
    for record_key in store.keymatch(r'G\d+_ACTU'):
        goods = key_id(record_key)
        parsed = dict(
            (int(user), float(score))
            for user, score in store.getd(record_key).items()
            if user and score
        )
        # Keep only the 20 strongest entries of each vector.
        ranked = sorted(parsed.items(), key=lambda kv: kv[1], reverse=True)
        vector = dict(ranked[:20])
        normalize(vector)  # return value unused; relied on for its effect on vector
        vectors[goods] = vector

    # Similarity calculation: visit every unordered pair once and record the
    # value under both goods ids.
    similarity = {}
    goods_ids = sorted(vectors)
    count = len(goods_ids)
    print("Calculating goods-goods similarity matrix, total %d..." % count)
    for a in range(count):
        if a % 100 == 0:
            print("%d" % a)
            sys.stdout.flush()
        for b in range(a + 1, count):
            first, second = goods_ids[a], goods_ids[b]
            sim = norm_dot_product(vectors[first], vectors[second])
            if abs(sim) < 1e-5:
                continue
            similarity.setdefault(first, {})[second] = sim
            similarity.setdefault(second, {})[first] = sim

    # Save as a kv file.
    write_kv_dict(similarity, 'G%s_CFSIMG', 'cfgg.kv')
def main():
    """Dump a goods similarity table as an HTML page for eyeballing.

    Usage: <goods.simi> <goods.docinfo> <output.html>
    A wrong argument count prints the usage line and exits with status 0.
    The goods.docinfo argument is counted by that check but never read in
    this function.

    Output layout: one main snippet per goods id in the similarity table,
    each followed by sub snippets for up to 10 of its similar goods in
    descending weight order.  The page is written to the output path once
    fully assembled.
    """
    if len(sys.argv) != 4:
        print('Usage: <goods.simi> <goods.docinfo> <output.html>')
        sys.exit(0)

    simi_path, output_path = sys.argv[1], sys.argv[3]
    goods_simi = read_simi(simi_path)

    kvg = KVEngine()
    kvg.load([full_path('goods_binfo.kv')])
    kvg.load([full_path('goods_price.kv')])
    kvg.load([full_path('goods_cat.kv')])

    html = StringIO()
    html.write(
        '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>'
    )

    for case, gid in enumerate(goods_simi):
        title, imgurl, taobaourl, shopid, price, category = ginfo(kvg, gid)
        html.write(
            main_snippet(case, gid, title, imgurl, taobaourl, shopid, price,
                         category))

        # Strongest neighbours first, capped at 10 per goods item.
        neighbours = sorted(goods_simi[gid].items(),
                            key=lambda entry: entry[1], reverse=True)[:10]
        for subcase, (rgid, weight) in enumerate(neighbours):
            rtitle, rimgurl, rtaobaourl, rshopid, rprice, rcategory = ginfo(
                kvg, rgid)
            html.write(
                sub_snippet(subcase, rgid, rtitle, rimgurl, rtaobaourl,
                            weight, rshopid, rprice, rcategory))

    # Context manager guarantees the handle is flushed and closed rather
    # than left to the garbage collector.
    with open(output_path, 'w') as out:
        out.write(html.getvalue())
def compute_cfus(): ''' 计算给用户推荐的店铺列表。 Input: cfss: 店铺关系 user_favu: 用户关注店铺 user_actu: 用户有动作店铺 Process: 从用户直接相关店铺出发,找这些店铺的相关店铺,再过滤。 Output: 存储CF算法产生的给用户推荐的店铺列表。cfus.kv ''' kvg = KVEngine() kvg.load([full_path('cfss.kv')]) kvg.load([full_path('user_favs.kv')]) kvg.load([full_path('user_actu.kv')]) kvg.load([full_path('shop_binfo.kv')]) # get shop_similarity keys = kvg.keymatch('S\d+_CFSIMS') shop_similarity = dict([(int(key), dict([(int(k), float(v)) for (k, v) in kvg.getd(key).items()])) for key in keys]) # get user_fav_shops keys = kvg.keymatch('U\d+_FAVS') user_fav_shops = dict([(int(key), set([int(k) for k in kvg.getl(key)])) for key in keys]) # get blocked shop set keys = kvg.keymatch('S\d+_BINFO') blocked_shops = set() for key in keys: if kvg.getk(key, 'block') != '0': blocked_shops.add(key_id(key)) # get user tags by fav shops shop_tags # get user_shops # shop idf # weigting and normalizing user_shops # 给每个用户做推荐 print "Recommend for each user, total %d" % len(self.user_shops) sys.stdout.flush() for no, uid in enumerate(self.user_shops): shop_weight = {} # 给该用户推荐的店铺列表及权重 shops = self.user_shops[uid] # 用户有动作的店铺列表 fav_shops = self.user_fav_shops.get(uid, {}) # 用户关注的店铺 if no % 1000 == 0: print "%d" % no sys.stdout.flush() for sid in shops: if sid not in self.shop_similarity: continue simi_shops = self.shop_similarity[sid] for ssid in simi_shops: if ssid in shop_weight: shop_weight[ssid] += shops[sid] * simi_shops[ssid] else: shop_weight[ssid] = shops[sid] * simi_shops[ssid] # 过滤shop_weight shop_weight_new = {} for sid in shop_weight: # 店铺sid是否适合推荐给用户uid if sid in fav_shops: continue # 原本就关注 if sid in self.shop_info and self.shop_info[sid][2] != 0: continue # 店铺的block属性非0,被屏蔽,不使用 if sid in self.shop_tags and uid in self.user_tags and \ self._tag_conflict(self.user_tags[uid], self.shop_tags[sid]): continue # 用户关注店铺的类型与该店铺不符 shop_weight_new[sid] = shop_weight[sid] if not shop_weight_new: continue # 没有为此用户推荐一个店铺,都被过滤掉,不记录 # 排序,取TOP normalize(shop_weight_new) items = 
shop_weight_new.items() items.sort(reverse=True, key=lambda x: x[1]) # sort by weight desc self.user_recommend_list[uid] = items[:TOP_SHOP_NUM] # limit n