def comp_sim_shops():
    """Find similar shops from their listing-category distributions.

    Builds a per-shop category distribution from the training data,
    runs every shop through the (cosine, by default) similarity engine,
    and collects each shop's top similar shops.

    Returns:
        dict mapping shop_name -> list of similar shop names, in the
        order emitted by DistribSimil.all_top_pairs().
    """
    # categories to bin; None means just use all that show up
    cats = None
    # identifying info to keep alongside each store's distribution
    keys = ['shop_id', 'shop_name']
    # build shop category distribution for each store
    shop_d = ShopVec('data/shops_train.json', keys=keys)
    dat = pd.DataFrame([s for s in shop_d.category_dist(cats)]).fillna(0)
    # extract category column names: every column that isn't an id/name key
    if cats is None:
        cats = [c for c in dat.columns if c not in keys]
    # pass data to similarity engine; it hides the guts of splitting out
    # process workers. Default metric is cosine similarity.
    ds = DistribSimil(dat[cats].values)
    pair_d = {}
    for pairs in ds.all_top_pairs():
        # NOTE(review): .ix is deprecated/removed in modern pandas; with the
        # default integer index, .loc behaves identically here — confirm
        # before upgrading pandas.
        shop = dat.ix[pairs[0], 'shop_name']
        sim_shop = dat.ix[pairs[1], 'shop_name']
        # was: try/append with a bare `except:` — setdefault does the same
        # grouping without swallowing unrelated exceptions
        pair_d.setdefault(shop, []).append(sim_shop)
    return pair_d
def comp_sim_shops(): # info to maintain about the stores keys = ['shop_id','shop_name'] # build shop category distribution for each store shop_d = ShopVec('data/shops_train.json', keys=keys) dat_gen = shop_d.data_toggle(cat_dist=True, title=True, product_title=True, tags=True) dat = pd.DataFrame([s for s in dat_gen]) # extract category names not_cats = keys + shop_d.nonnumeric_cols cats = [c for c in dat.columns if c not in not_cats] dat[cats] = dat[cats].fillna(0) # pass data to similarity engine # hides the guts of splitting out process workers # default metric is cosine similarity print 'Computing listing category similarity' ds = DistribSimil(dat[cats].values, njobs=20) pair_d = {} for pairs in ds.all_top_pairs(top=200): shop = dat.ix[pairs[0],'shop_name'] sim_shop = dat.ix[pairs[1],'shop_name'] try: pair_d[shop][sim_shop] = pairs[2] except: pair_d[shop] = { sim_shop: pairs[2] } print 'Vectorizing shop and listing docs' shop_vec = ShopTfIdf( max_features=None, ngram_range=(1,2), min_df=0.01, max_df=0.8 ) X_t = shop_vec.fit_transform(dat['doc']) print 'Computing shop and listing doc similarity' ds = DistribSimil(X_t) for pairs in ds.all_top_pairs(top=200): shop = dat.ix[pairs[0],'shop_name'] sim_shop = dat.ix[pairs[1],'shop_name'] try: pair_d[shop][sim_shop] += pairs[2] except: pair_d[shop][sim_shop] = pairs[2] # extract top 5 for each pair_d_t = { } for ref,comps in pair_d.iteritems(): pair_d_t[ref] = [c[0] for c in sorted(comps.iteritems(), key=lambda r: -r[1])[:5]] return pair_d_t