def compute_cfss():
    """Build the shop-shop similarity matrix and save it as ``cfss.kv``.

    Input:
        ``shop_actu.kv`` (located via ``full_path``).  Per the original
        notes it holds the actions users performed on each shop.
    Process:
        Each ``S<digits>_ACTU`` entry becomes an ``int -> float`` vector,
        is cut down to its 20 largest weights and passed to ``normalize``.
        ``norm_dot_product`` is then evaluated for every unordered pair of
        shops.
    Output:
        Scores whose absolute value is below 1e-5 are dropped; the rest are
        recorded in both directions and written through ``write_kv_dict``
        using the ``S%s_CFSIMS`` key template.
    """
    engine = KVEngine()
    engine.load([full_path('shop_actu.kv')])

    # Step 1: one truncated, normalized vector per shop id.
    vectors = {}
    for raw_key in engine.keymatch(r'S\d+_ACTU'):
        shop_id = key_id(raw_key)
        weights = {}
        for field, weight in engine.getd(raw_key).items():
            # Entries with a falsy key or value are ignored.
            if field and weight:
                weights[int(field)] = float(weight)
        # Keep the 20 heaviest entries; the stable sort leaves ties in
        # their original relative order.
        heaviest = sorted(weights.items(), key=lambda pair: pair[1],
                          reverse=True)[:20]
        trimmed = dict(heaviest)
        # Return value is ignored; presumably rescales the dict in place.
        normalize(trimmed)
        vectors[shop_id] = trimmed

    # Step 2: similarity of every unordered pair, stored symmetrically.
    similarity = {}
    ordered_ids = sorted(vectors)
    total = len(ordered_ids)
    print("Calculating shop-shop similarity matrix, total %d..." % total)
    for pos, left_id in enumerate(ordered_ids):
        if pos % 1000 == 0:
            # Progress marker every 1000 shops.
            print("%d" % pos)
            sys.stdout.flush()
        left_vec = vectors[left_id]
        for right_id in ordered_ids[pos + 1:]:
            score = norm_dot_product(left_vec, vectors[right_id])
            if abs(score) < 1e-5:
                continue
            similarity.setdefault(left_id, {})[right_id] = score
            similarity.setdefault(right_id, {})[left_id] = score

    # Step 3: persist as a kv file.
    write_kv_dict(similarity, 'S%s_CFSIMS', 'cfss.kv')
def compute_cfgg():
    """Build the goods-goods similarity matrix and save it as ``cfgg.kv``.

    Input:
        ``goods_actu.kv`` (located via ``full_path``).  Per the original
        notes it is derived from ``user_actg.kv`` and holds the actions
        users performed on each goods item.
    Process:
        Each ``G<digits>_ACTU`` entry becomes an ``int -> float`` vector,
        is cut down to its 20 largest weights and passed to ``normalize``.
        ``norm_dot_product`` is then evaluated for every unordered pair of
        goods.
    Output:
        Scores whose absolute value is below 1e-5 are dropped; the rest are
        recorded in both directions and written through ``write_kv_dict``
        using the ``G%s_CFSIMG`` key template.
    """
    engine = KVEngine()
    engine.load([full_path('goods_actu.kv')])

    # Step 1: one truncated, normalized vector per goods id.
    vectors = {}
    for raw_key in engine.keymatch(r'G\d+_ACTU'):
        goods_id = key_id(raw_key)
        weights = {}
        for field, weight in engine.getd(raw_key).items():
            # Entries with a falsy key or value are ignored.
            if field and weight:
                weights[int(field)] = float(weight)
        # Keep the 20 heaviest entries; the stable sort leaves ties in
        # their original relative order.
        heaviest = sorted(weights.items(), key=lambda pair: pair[1],
                          reverse=True)[:20]
        trimmed = dict(heaviest)
        # Return value is ignored; presumably rescales the dict in place.
        normalize(trimmed)
        vectors[goods_id] = trimmed

    # Step 2: similarity of every unordered pair, stored symmetrically.
    similarity = {}
    ordered_ids = sorted(vectors)
    total = len(ordered_ids)
    print("Calculating goods-goods similarity matrix, total %d..." % total)
    for pos, left_id in enumerate(ordered_ids):
        if pos % 100 == 0:
            # Progress marker every 100 goods.
            print("%d" % pos)
            sys.stdout.flush()
        left_vec = vectors[left_id]
        for right_id in ordered_ids[pos + 1:]:
            score = norm_dot_product(left_vec, vectors[right_id])
            if abs(score) < 1e-5:
                continue
            similarity.setdefault(left_id, {})[right_id] = score
            similarity.setdefault(right_id, {})[left_id] = score

    # Step 3: persist as a kv file.
    write_kv_dict(similarity, 'G%s_CFSIMG', 'cfgg.kv')
def compute_cfss():
    """Build the shop-shop similarity matrix and save it as ``cfss.kv``.

    Input:
        ``shop_actu.kv`` (located via ``full_path``).  Per the original
        notes it holds the actions users performed on each shop.
    Process:
        Each ``S<digits>_ACTU`` entry becomes an ``int -> float`` vector,
        is cut down to its 20 largest weights and passed to ``normalize``.
        ``norm_dot_product`` is then evaluated for every unordered pair of
        shops.
    Output:
        Scores whose absolute value is below 1e-5 are dropped; the rest are
        recorded in both directions and written through ``write_kv_dict``
        using the ``S%s_CFSIMS`` key template.
    """
    engine = KVEngine()
    engine.load([full_path('shop_actu.kv')])

    # Step 1: one truncated, normalized vector per shop id.
    vectors = {}
    for raw_key in engine.keymatch(r'S\d+_ACTU'):
        shop_id = key_id(raw_key)
        weights = {}
        for field, weight in engine.getd(raw_key).items():
            # Entries with a falsy key or value are ignored.
            if field and weight:
                weights[int(field)] = float(weight)
        # Keep the 20 heaviest entries; the stable sort leaves ties in
        # their original relative order.
        heaviest = sorted(weights.items(), key=lambda pair: pair[1],
                          reverse=True)[:20]
        trimmed = dict(heaviest)
        # Return value is ignored; presumably rescales the dict in place.
        normalize(trimmed)
        vectors[shop_id] = trimmed

    # Step 2: similarity of every unordered pair, stored symmetrically.
    similarity = {}
    ordered_ids = sorted(vectors)
    total = len(ordered_ids)
    print("Calculating shop-shop similarity matrix, total %d..." % total)
    for pos, left_id in enumerate(ordered_ids):
        if pos % 1000 == 0:
            # Progress marker every 1000 shops.
            print("%d" % pos)
            sys.stdout.flush()
        left_vec = vectors[left_id]
        for right_id in ordered_ids[pos + 1:]:
            score = norm_dot_product(left_vec, vectors[right_id])
            if abs(score) < 1e-5:
                continue
            similarity.setdefault(left_id, {})[right_id] = score
            similarity.setdefault(right_id, {})[left_id] = score

    # Step 3: persist as a kv file.
    write_kv_dict(similarity, 'S%s_CFSIMS', 'cfss.kv')
def compute_cfgg():
    """Build the goods-goods similarity matrix and save it as ``cfgg.kv``.

    Input:
        ``goods_actu.kv`` (located via ``full_path``).  Per the original
        notes it is derived from ``user_actg.kv`` and holds the actions
        users performed on each goods item.
    Process:
        Each ``G<digits>_ACTU`` entry becomes an ``int -> float`` vector,
        is cut down to its 20 largest weights and passed to ``normalize``.
        ``norm_dot_product`` is then evaluated for every unordered pair of
        goods.
    Output:
        Scores whose absolute value is below 1e-5 are dropped; the rest are
        recorded in both directions and written through ``write_kv_dict``
        using the ``G%s_CFSIMG`` key template.
    """
    engine = KVEngine()
    engine.load([full_path('goods_actu.kv')])

    # Step 1: one truncated, normalized vector per goods id.
    vectors = {}
    for raw_key in engine.keymatch(r'G\d+_ACTU'):
        goods_id = key_id(raw_key)
        weights = {}
        for field, weight in engine.getd(raw_key).items():
            # Entries with a falsy key or value are ignored.
            if field and weight:
                weights[int(field)] = float(weight)
        # Keep the 20 heaviest entries; the stable sort leaves ties in
        # their original relative order.
        heaviest = sorted(weights.items(), key=lambda pair: pair[1],
                          reverse=True)[:20]
        trimmed = dict(heaviest)
        # Return value is ignored; presumably rescales the dict in place.
        normalize(trimmed)
        vectors[goods_id] = trimmed

    # Step 2: similarity of every unordered pair, stored symmetrically.
    similarity = {}
    ordered_ids = sorted(vectors)
    total = len(ordered_ids)
    print("Calculating goods-goods similarity matrix, total %d..." % total)
    for pos, left_id in enumerate(ordered_ids):
        if pos % 100 == 0:
            # Progress marker every 100 goods.
            print("%d" % pos)
            sys.stdout.flush()
        left_vec = vectors[left_id]
        for right_id in ordered_ids[pos + 1:]:
            score = norm_dot_product(left_vec, vectors[right_id])
            if abs(score) < 1e-5:
                continue
            similarity.setdefault(left_id, {})[right_id] = score
            similarity.setdefault(right_id, {})[left_id] = score

    # Step 3: persist as a kv file.
    write_kv_dict(similarity, 'G%s_CFSIMG', 'cfgg.kv')
def worker(tasks, foutput, lock, groups):
    """Write the most similar same-group rows for every row id in ``tasks``.

    For each ``rid`` the candidates are all rows sharing at least one column
    id with it (looked up through the module-level ``rows`` and
    ``cid2rids``), restricted to rows that have a group in common with
    ``rid``.  Candidates are scored with ``norm_dot_product``; scores below
    0.1 are dropped and at most the 40 best are kept.  One line per row is
    emitted as ``"<rid> <rrid>:<score> ..."`` with four-decimal scores.

    Args:
        tasks: iterable of row ids to process.
        foutput: writable file-like object; every write to it is guarded by
            ``lock`` (presumably because other workers share it).
        lock: object providing ``acquire()`` and ``release()``.
        groups: mapping ``rid -> set`` of groups; ids missing from it are
            treated as belonging to no group.

    Rows without any same-group candidate produce no output line.
    """
    # ``rows`` and ``cid2rids`` are module-level globals that are only read
    # here, so no ``global`` declaration is required.
    pending = StringIO()
    no_groups = frozenset()

    def flush_pending():
        # Hand the buffered lines to the output under the lock.
        text = pending.getvalue()
        if not text:
            return
        # acquire/release stay explicit (instead of ``with lock``) so any
        # object exposing just these two methods keeps working; the finally
        # clause guarantees that a failed write cannot leave the lock held.
        lock.acquire()
        try:
            foutput.write(text)
            foutput.flush()
        finally:
            lock.release()
        # Rewind before truncating: io.StringIO.truncate() leaves the stream
        # position untouched, so without the seek later writes would be
        # padded with NUL characters.
        pending.seek(0)
        pending.truncate()

    for count, rid in enumerate(tasks):
        # Flush in batches of 100 tasks so the lock is taken once per batch
        # rather than once per output line.
        if count % 100 == 0:
            flush_pending()

        row = rows[rid]
        candidates = set()
        for cid in row.keys():
            candidates.update(cid2rids[cid])
        # discard() instead of remove(): a row with no columns (or one that
        # is absent from the index) must not abort the whole worker.
        candidates.discard(rid)

        # Keep only candidates sharing at least one group with ``rid``.
        own_groups = groups.get(rid, no_groups)
        peers = set(
            other for other in candidates
            if groups.get(other, no_groups).intersection(own_groups)
        )
        if not peers:
            continue

        scores = {}
        for other in peers:
            scores[other] = norm_dot_product(row, rows[other])

        # Filter first, then rank: same result as ranking everything, with
        # less to sort.
        kept = [pair for pair in scores.items() if pair[1] >= 0.1]
        kept.sort(key=lambda pair: pair[1], reverse=True)
        # A row whose scores all fall below the threshold still gets a line
        # with an empty neighbour list, as before.
        pending.write('%d %s\n' % (
            rid, ' '.join(['%d:%.4f' % pair for pair in kept[:40]])))

    # Emit whatever is left from the last partial batch.
    flush_pending()
def worker(tasks, foutput, lock, groups):
    """Write the most similar same-group rows for every row id in ``tasks``.

    For each ``rid`` the candidates are all rows sharing at least one column
    id with it (looked up through the module-level ``rows`` and
    ``cid2rids``), restricted to rows that have a group in common with
    ``rid``.  Candidates are scored with ``norm_dot_product``; scores below
    0.1 are dropped and at most the 40 best are kept.  One line per row is
    emitted as ``"<rid> <rrid>:<score> ..."`` with four-decimal scores.

    Args:
        tasks: iterable of row ids to process.
        foutput: writable file-like object; every write to it is guarded by
            ``lock`` (presumably because other workers share it).
        lock: object providing ``acquire()`` and ``release()``.
        groups: mapping ``rid -> set`` of groups; ids missing from it are
            treated as belonging to no group.

    Rows without any same-group candidate produce no output line.
    """
    # ``rows`` and ``cid2rids`` are module-level globals that are only read
    # here, so no ``global`` declaration is required.
    pending = StringIO()
    no_groups = frozenset()

    def flush_pending():
        # Hand the buffered lines to the output under the lock.
        text = pending.getvalue()
        if not text:
            return
        # acquire/release stay explicit (instead of ``with lock``) so any
        # object exposing just these two methods keeps working; the finally
        # clause guarantees that a failed write cannot leave the lock held.
        lock.acquire()
        try:
            foutput.write(text)
            foutput.flush()
        finally:
            lock.release()
        # Rewind before truncating: io.StringIO.truncate() leaves the stream
        # position untouched, so without the seek later writes would be
        # padded with NUL characters.
        pending.seek(0)
        pending.truncate()

    for count, rid in enumerate(tasks):
        # Flush in batches of 100 tasks so the lock is taken once per batch
        # rather than once per output line.
        if count % 100 == 0:
            flush_pending()

        row = rows[rid]
        candidates = set()
        for cid in row.keys():
            candidates.update(cid2rids[cid])
        # discard() instead of remove(): a row with no columns (or one that
        # is absent from the index) must not abort the whole worker.
        candidates.discard(rid)

        # Keep only candidates sharing at least one group with ``rid``.
        own_groups = groups.get(rid, no_groups)
        peers = set(
            other for other in candidates
            if groups.get(other, no_groups).intersection(own_groups)
        )
        if not peers:
            continue

        scores = {}
        for other in peers:
            scores[other] = norm_dot_product(row, rows[other])

        # Filter first, then rank: same result as ranking everything, with
        # less to sort.
        kept = [pair for pair in scores.items() if pair[1] >= 0.1]
        kept.sort(key=lambda pair: pair[1], reverse=True)
        # A row whose scores all fall below the threshold still gets a line
        # with an empty neighbour list, as before.
        pending.write('%d %s\n' % (
            rid, ' '.join(['%d:%.4f' % pair for pair in kept[:40]])))

    # Emit whatever is left from the last partial batch.
    flush_pending()