Example 1
 def comp_sim_shops():
   """Group each shop with the names of its most similar shops.

   Builds a per-shop category distribution from the training data, feeds
   it to the similarity engine (cosine similarity by default), and maps
   each shop name to the list of shop names that scored as most similar.

   Returns:
     dict mapping shop_name -> list of similar shop_names.
   """
   # Categories to bin; None means just use all categories that show up.
   cats = None

   # Identifying info to maintain about the stores (not part of the
   # distribution itself).
   keys = ['shop_id', 'shop_name']

   # Build the shop category distribution for each store.
   shop_d = ShopVec('data/shops_train.json', keys=keys)
   dat = pd.DataFrame(list(shop_d.category_dist(cats))).fillna(0)

   # No explicit category list given: infer it from the DataFrame columns.
   if cats is None:
     cats = [c for c in dat.columns if c not in keys]

   # Pass data to the similarity engine; it hides the guts of splitting
   # out process workers. Default metric is cosine similarity.
   ds = DistribSimil(dat[cats].values)

   pair_d = {}
   for pairs in ds.all_top_pairs():
     # .loc replaces the deprecated .ix indexer; equivalent here because
     # the DataFrame carries its default integer index.
     shop = dat.loc[pairs[0], 'shop_name']
     sim_shop = dat.loc[pairs[1], 'shop_name']
     # setdefault replaces the bare try/except that swallowed every
     # exception type, not just the missing-key case.
     pair_d.setdefault(shop, []).append(sim_shop)

   return pair_d
Example 2
  def comp_sim_shops():
    
    # info to maintain about the stores
    keys = ['shop_id','shop_name']
    
    # build shop category distribution for each store
    shop_d = ShopVec('data/shops_train.json', keys=keys)
    dat_gen = shop_d.data_toggle(cat_dist=True, title=True, product_title=True, tags=True)
    dat = pd.DataFrame([s for s in dat_gen])
    
    # extract category names
    not_cats = keys + shop_d.nonnumeric_cols
    cats = [c for c in dat.columns if c not in not_cats]
    dat[cats] = dat[cats].fillna(0)
    
    # pass data to similarity engine
    # hides the guts of splitting out process workers
    # default metric is cosine similarity
    
    print 'Computing listing category similarity'
    ds = DistribSimil(dat[cats].values, njobs=20)
    
    pair_d = {}
    for pairs in ds.all_top_pairs(top=200):
      shop = dat.ix[pairs[0],'shop_name']
      sim_shop = dat.ix[pairs[1],'shop_name']
      try:
        pair_d[shop][sim_shop] = pairs[2]
      except:
        pair_d[shop] = { sim_shop: pairs[2] }
        
    
    print 'Vectorizing shop and listing docs'
    shop_vec = ShopTfIdf(
      max_features=None,
      ngram_range=(1,2),
      min_df=0.01,
      max_df=0.8
    )
  
    X_t = shop_vec.fit_transform(dat['doc'])
    
    print 'Computing shop and listing doc similarity'
    ds = DistribSimil(X_t)

    for pairs in ds.all_top_pairs(top=200):
      shop = dat.ix[pairs[0],'shop_name']
      sim_shop = dat.ix[pairs[1],'shop_name']
      try:
        pair_d[shop][sim_shop] += pairs[2]
      except:
        pair_d[shop][sim_shop] = pairs[2]
        
    # extract top 5 for each
    pair_d_t = { }
    for ref,comps in pair_d.iteritems():
      pair_d_t[ref] = [c[0] for c in sorted(comps.iteritems(), key=lambda r: -r[1])[:5]]
        
    return pair_d_t