def global_info(city, standalone=False):
    """Gather global statistics about `city`.

    The function, in order:

    * projects the `loc` field of every venue and checkin whose `city` is
      `city`, and of every photo whose `hint` is `city`, with `geo_project`;
    * fetches the visits, the visitors and the density estimate of `city`;
    * reloads the module-level `TOP_CATS` from 'top_cats.my';
    * builds one `s.Surrounding` per collection (venues, checkins, photos);
    * saves the venue surroundings to '<city>_svenues.my' and, when
      `standalone` is true, also the checkin and photo ones to
      '<city>_scheckins.my' and '<city>_sphotos.my'.

    `standalone` also selects which extra fields are handed to
    `s.Surrounding`: none for venues and ['taken'] for photos when true,
    ['cat', 'cats'] for venues and ['venue'] for photos otherwise.

    Return a flat list of nine elements:
    [lvenues, lcheckins, lphotos, visits, visitors, density,
     svenues, scheckins, sphotos].
    """
    global TOP_CATS

    # Fresh query dicts are built for every call on purpose: nothing visible
    # here guarantees that the callees leave them untouched.
    lvenues = geo_project(city, DB.venue.find({'city': city}, {'loc': 1}))
    lcheckins = geo_project(city,
                            DB.checkin.find({'city': city}, {'loc': 1}))
    lphotos = geo_project(city,
                          CLIENT.world.photos.find({'hint': city},
                                                   {'loc': 1}))
    local_projection = [lvenues, lcheckins, lphotos]

    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = estimate_density(city)
    activity = [visits, visitors, density]

    TOP_CATS = p.load_var('top_cats.my')

    infos = {'venue': [] if standalone else ['cat', 'cats'],
             'photo': ['taken'] if standalone else ['venue']}
    svenues = s.Surrounding(DB.venue, {'city': city}, infos['venue'],
                            lvenues)
    scheckins = s.Surrounding(DB.checkin, {'city': city}, ['time'],
                              lcheckins)
    sphotos = s.Surrounding(CLIENT.world.photos, {'hint': city},
                            infos['photo'], lphotos)
    surroundings = [svenues, scheckins, sphotos]

    # Venue surroundings are always persisted; the other two only in
    # standalone mode. Each file is written exactly once (the venue one used
    # to be written twice when `standalone` was true).
    # NOTE(review): assumes p.save_var overwrites its target, so dropping the
    # duplicate write leaves the same files on disk -- confirm.
    to_save = ['venue', 'checkin', 'photo'] if standalone else ['venue']
    for name, var in zip(to_save, surroundings):
        p.save_var('{}_s{}s.my'.format(city, name), var)

    return local_projection + activity + surroundings
def venues_info(vids, visits=None, visitors=None, density=None, depth=10,
                tags_freq=True):
    """Return various info about the venues whose ids are in `vids`.

    `vids` is a non-empty sequence of venue ids; the city is read from the
    venue `vids[0]`. `visits`, `visitors` and `density` are reloaded for that
    city when the given value is falsy (not only when it is None). `depth`
    selects the category level: `top_category` is used when it is 1 and
    `parenting_cat(cat, depth)` otherwise. `tags_freq` controls whether tags
    are extracted and counted.

    Return a tuple `(res, tags)` where

    * `res` is a DataFrame indexed by venue id with the columns 'name',
      'price', 'rating', 'likes', 'users', 'checkins', then 'tags' (only
      when `tags_freq` is true), 'cat', 'vis', 'H', 'Ht' and 'Den';
    * `tags` is an OrderedDict mapping each normalized tag to its number of
      occurrences, sorted by decreasing count (empty when `tags_freq` is
      false).

    Raise IndexError when `vids` is an empty list and AssertionError when
    the database does not return exactly `len(vids)` venues.
    """
    city = DB.venue.find_one({'_id': vids[0]})['city']
    visits = visits or xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = visitors or xp.get_visitors(CLIENT, city)
    density = density or estimate_density(city)

    wanted = {'cat': 1, 'name': 1, 'loc': 1, 'price': 1, 'rating': 1,
              'tags': 1, 'likes': 1, 'usersCount': 1, 'checkinsCount': 1}
    venues = list(DB.venue.find({'_id': {'$in': vids}}, wanted))
    msg = 'Asked for {} but get only {}'.format(len(vids), len(venues))
    assert len(vids) == len(venues), msg

    res = pd.DataFrame(index=[venue['_id'] for venue in venues])
    # 'usersCount' and 'checkinsCount' become the columns 'users' and
    # 'checkins'.
    for field in ['name', 'price', 'rating', 'likes', 'usersCount',
                  'checkinsCount']:
        res[field.replace('Count', '')] = [venue[field] for venue in venues]

    tags = defaultdict(int)
    if tags_freq:
        # Normalize every tag once and reuse the result for both the column
        # and the frequency count.
        per_venue_tags = [[normalized_tag(tag) for tag in venue['tags']]
                          for venue in venues]
        res['tags'] = per_venue_tags
        for venue_tags in per_venue_tags:
            for tag in venue_tags:
                tags[tag] += 1

    if depth == 1:
        res['cat'] = [top_category(venue['cat']) for venue in venues]
    else:
        res['cat'] = [parenting_cat(venue['cat'], depth)
                      for venue in venues]
    res['vis'] = [len(visits[id_]) for id_ in res.index]
    res['H'] = [venue_entropy(visitors[id_]) for id_ in res.index]
    res['Ht'] = [time_entropy(visits[id_]) for id_ in res.index]

    loc = [venue['loc']['coordinates'] for venue in venues]
    # fliplr reverses the column order of the coordinate pairs before
    # projecting them.
    # NOTE(review): presumably turns GeoJSON (lon, lat) into (lat, lon) --
    # confirm against GEO_TO_2D.
    coords = np.fliplr(np.array(loc))
    points = cm.cities.GEO_TO_2D[city](coords)
    res['Den'] = density(points)

    # .items() instead of the Python 2 only .iteritems(), so that this also
    # runs under Python 3.
    by_count = sorted(tags.items(), key=lambda x: x[1], reverse=True)
    return res, OrderedDict(by_count)
# Exploratory clustering of the venue feature matrix of `city`.

# Estimators tried along the way: each assignment replaces the previous one
# and none of these three is fitted in this section.
clusterer = cl.DBSCAN(eps=5, min_samples=8, metric='cityblock')
clusterer = cl.AffinityPropagation(damping=0.55, affinity='euclidean')
clusterer = cl.SpectralClustering(3, affinity='cosine', n_init=3)

hel = cn.load_matrix(city)
features = hel['v']

# The value returned by fit_transform is discarded.
# NOTE(review): presumably relies on copy=False rescaling the first three
# columns of `features` in place -- confirm.
scale = pp.MinMaxScaler(copy=False)
scale.fit_transform(features[:, :3])

# Silhouette score of k-means for every k from 3 to 15, printing the size of
# each cluster on the way.
scores = []
for k in range(3, 16):
    clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
    labels = clusterer.fit_predict(features)
    scores.append(mt.silhouette_score(features, labels))
    print(Counter(labels))

# Values of k ordered by decreasing score (position i of `scores` is k=i+3).
# The result is not stored: it only shows up in an interactive session.
np.argsort(scores)[::-1] + 3
ppl.plot(range(3, 16), scores[:], '+')

# NOTE(review): neither estimator below is fitted in this section, so
# `labels` is still the assignment of the last loop iteration (k = 15), not
# the one of KMeans(6) -- confirm that this is intended.
clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)

visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
visitors = xp.get_visitors(CLIENT, city)
density = vf.estimate_density(city)

# Describe the venues labelled 0 and 5, keeping only ids present in `visits`.
c0, _ = vf.venues_info(
    [_ for _ in hel['i'][labels == 0].tolist() if _ in visits],
    visits,
    visitors,
    density,
    depth=2,
    tags_freq=False,
)
c5, v = vf.venues_info(
    [v for v in hel['i'][labels == 5].tolist() if v in visits],
    visits,
    visitors,
    density,
    depth=2,
    tags_freq=False,
)
# Summary statistics, again only visible in an interactive session.
c0.describe()
c5.describe()
# Model selection for k-means on the feature matrix stored in `hel`.
features = hel['v']

# The value returned by fit_transform is discarded.
# NOTE(review): presumably relies on copy=False rescaling the first three
# columns of `features` in place -- confirm.
scale = pp.MinMaxScaler(copy=False)
scale.fit_transform(features[:, :3])

# Silhouette score of k-means for every k from 3 to 15, printing the size of
# each cluster on the way.
scores = []
for k in range(3, 16):
    clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
    labels = clusterer.fit_predict(features)
    scores.append(mt.silhouette_score(features, labels))
    print(Counter(labels))

# Values of k ordered by decreasing score (position i of `scores` is k=i+3).
# The result is not stored: it only shows up in an interactive session.
np.argsort(scores)[::-1] + 3
ppl.plot(range(3, 16), scores[:], '+')

# NOTE(review): neither estimator below is fitted in this section, so
# `labels` is still the assignment of the last loop iteration (k = 15), not
# the one of KMeans(6) -- confirm that this is intended.
clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)

visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
visitors = xp.get_visitors(CLIENT, city)
density = vf.estimate_density(city)

# Gather info on the venues labelled 0 and 5, keeping only ids present in
# `visits`.
c0, _ = vf.venues_info(
    [_ for _ in hel['i'][labels == 0].tolist() if _ in visits],
    visits,
    visitors,
    density,
    depth=2,
    tags_freq=False,
)
c5, v = vf.venues_info(
    [v for v in hel['i'][labels == 5].tolist() if v in visits],
    visits,
    visitors,
    density,
    depth=2,
    tags_freq=False,
)