def plot_city(city, weekly=False, clusters=5):
    """Cluster the venues time patterns of `city` and save the result on disk.

    Every venue of `city` with more than 5 visits is turned into a visit
    frequency profile, daily by default or weekly if `weekly` is true. These
    profiles are grouped into `clusters` clusters by `DO_CLUSTER`. The
    clustering is run 7 times and only the run with the lowest distortion
    (as computed by `vf.get_distorsion`) is kept.

    The first value returned by `DO_CLUSTER` for that run (presumably the
    cluster centroids, one per row) is reordered and written under the key
    't' in the Matlab file 'times/<city>_weekly_time' or
    'times/<city>_daily_time' (scipy adds the '.mat' extension by default).
    The drawing code is currently commented out, so nothing is plotted.

    Raise ValueError if no run yields a comparable distortion (e.g. all of
    them are NaN).
    """
    shift = 2  # start from 1am instead of midnight
    chunk = 4
    venue_visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    # Compute aggregated frequency for venues with more than 5 visits. The
    # result of `aggregate_visits` is indexed by 0 when `weekly` is false
    # and by 1 when it is true.
    enough = {
        venue: xp.to_frequency(
            xp.aggregate_visits(times, shift, chunk)[int(weekly)])
        for venue, times in venue_visits.items()
        if len(times) > 5
    }
    # list() is needed on Python 3, where dict.values() is a view that numpy
    # would wrap into a 0-dimensional object array.
    sval = np.array(list(enough.values()))
    best_disto, ak = float('inf'), None
    for _ in range(7):
        tak, tkl = DO_CLUSTER(sval, clusters)
        current_disto = vf.get_distorsion(tak, tkl, sval)
        if current_disto < best_disto:
            best_disto, ak = current_disto, tak
    if ak is None:
        # Every comparison failed: fail with an explicit message instead of
        # an unbound local variable error further down.
        raise ValueError('no clustering run returned a valid distortion')
    # Rank the rows of `ak` by the position of their largest component, so
    # that the saved order does not depend on the random initialisation.
    std_ord = np.argsort((np.argsort(ak)), 0)[:, -1]
    # vf.draw_classes(ak[std_ord, :], shift, chunk)
    # vf.plt.title('{}, {} venues'.format(city, len(enough)))
    # vf.plt.ylim([0, 0.28 if weekly else 0.9])
    out_name = 'times/' + city
    out_name += '_weekly' if weekly else '_daily'
    sio.savemat(out_name + '_time', {'t': ak[std_ord, :]},
                do_compression=True)
def global_info(city, standalone=False):
    """Collect locations, activity and surroundings data about `city`.

    Return a list of nine elements, in this order:
    - the `geo_project` results for the venues, checkins and photos of
      `city`,
    - its visits, its visitors and its estimated density,
    - the `s.Surrounding` objects built for venues, checkins and photos.

    `standalone` selects which extra fields are attached to the venues and
    photos surroundings.

    Side effects: the global `TOP_CATS` is reloaded from 'top_cats.my', and
    the venues surroundings are always saved in '<city>_svenues.my'. When
    `standalone` is true, the three surroundings are saved, each in its own
    '<city>_s<name>s.my' file.
    """
    global TOP_CATS
    photos_db = CLIENT.world.photos
    only_loc = {'loc': 1}

    # Projected locations of the three kinds of objects
    lvenues = geo_project(city, DB.venue.find({'city': city}, only_loc))
    lcheckins = geo_project(city, DB.checkin.find({'city': city}, only_loc))
    lphotos = geo_project(city, photos_db.find({'hint': city}, only_loc))

    # City wide activity
    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = estimate_density(city)

    TOP_CATS = p.load_var('top_cats.my')

    if standalone:
        venue_fields, photo_fields = [], ['taken']
    else:
        venue_fields, photo_fields = ['cat', 'cats'], ['venue']
    svenues = s.Surrounding(DB.venue, {'city': city}, venue_fields, lvenues)
    scheckins = s.Surrounding(DB.checkin, {'city': city}, ['time'],
                              lcheckins)
    sphotos = s.Surrounding(photos_db, {'hint': city}, photo_fields, lphotos)

    filename = '{}_s{}s.my'
    p.save_var(filename.format(city, 'venue'), svenues)
    if standalone:
        # NOTE(review): the venues surroundings were already saved just
        # above, so they are written a second time here.
        named = [('venue', svenues), ('checkin', scheckins),
                 ('photo', sphotos)]
        for name, surrounding in named:
            p.save_var(filename.format(city, name), surrounding)

    return [lvenues, lcheckins, lphotos,
            visits, visitors, density,
            svenues, scheckins, sphotos]
def plot_city(city, weekly=False, clusters=5):
    """Cluster the venues time patterns of `city` and save the result on disk.

    Every venue of `city` with more than 5 visits is turned into a visit
    frequency profile, daily by default or weekly if `weekly` is true. These
    profiles are grouped into `clusters` clusters by `DO_CLUSTER`. The
    clustering is run 7 times and only the run with the lowest distortion
    (as computed by `vf.get_distorsion`) is kept.

    The first value returned by `DO_CLUSTER` for that run (presumably the
    cluster centroids, one per row) is reordered and written under the key
    "t" in the Matlab file "times/<city>_weekly_time" or
    "times/<city>_daily_time" (scipy adds the ".mat" extension by default).
    The drawing code is currently commented out, so nothing is plotted.

    Raise ValueError if no run yields a comparable distortion (e.g. all of
    them are NaN).
    """
    shift = 2  # start from 1am instead of midnight
    chunk = 4
    venue_visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    # Compute aggregated frequency for venues with more than 5 visits. The
    # result of `aggregate_visits` is indexed by 0 when `weekly` is false
    # and by 1 when it is true.
    enough = {
        venue: xp.to_frequency(
            xp.aggregate_visits(times, shift, chunk)[int(weekly)])
        for venue, times in venue_visits.items()
        if len(times) > 5
    }
    # list() is needed on Python 3, where dict.values() is a view that numpy
    # would wrap into a 0-dimensional object array.
    sval = np.array(list(enough.values()))
    best_disto, ak = float("inf"), None
    for _ in range(7):
        tak, tkl = DO_CLUSTER(sval, clusters)
        current_disto = vf.get_distorsion(tak, tkl, sval)
        if current_disto < best_disto:
            best_disto, ak = current_disto, tak
    if ak is None:
        # Every comparison failed: fail with an explicit message instead of
        # an unbound local variable error further down.
        raise ValueError("no clustering run returned a valid distortion")
    # Rank the rows of `ak` by the position of their largest component, so
    # that the saved order does not depend on the random initialisation.
    std_ord = np.argsort((np.argsort(ak)), 0)[:, -1]
    # vf.draw_classes(ak[std_ord, :], shift, chunk)
    # vf.plt.title('{}, {} venues'.format(city, len(enough)))
    # vf.plt.ylim([0, 0.28 if weekly else 0.9])
    out_name = "times/" + city
    out_name += "_weekly" if weekly else "_daily"
    sio.savemat(out_name + "_time", {"t": ak[std_ord, :]},
                do_compression=True)
def photos_around(id_, centroid, offset, daily, radius=200):
    """Compare the time pattern of photos taken near `id_` with `centroid`.

    Photos within `radius` of the location of `id_` are fetched, their
    timestamps aggregated with the given `offset` and converted to a
    frequency pattern, which is then compared with every row of `centroid`.

    Return a tuple with:
    - the number of entries returned by the photos query,
    - the frequency pattern,
    - its Euclidean distance to each row of `centroid`,
    - the index of the closest row.
    """
    center = get_loc(id_)
    nearby = xp.get_visits(CLIENT, xp.Entity.photo, ball=(center, radius))
    aggregated = xp.aggregate_visits(nearby.values(), offset)
    # NOTE(review): `daily` is used directly as the index into the
    # aggregated result, whereas `plot_city` indexes the same helper with
    # `int(weekly)`; confirm that a true `daily` selects the intended
    # pattern.
    kind = xp.to_frequency(aggregated[daily])
    nb_class = centroid.shape[0]  # pylint: disable=E1101
    # One copy of the pattern per row of `centroid`, to subtract row by row
    repeated = np.tile(kind, (nb_class, 1))
    classes = np.linalg.norm(repeated - centroid, axis=1)
    closest = np.argmin(classes)
    return len(nearby), kind, classes, closest
def venues_info(vids, visits=None, visitors=None, density=None, depth=10,
                tags_freq=True):
    """Return various info about the venues whose ids are in `vids`.

    All venues are assumed to belong to the city of the first one. `visits`,
    `visitors` and `density` are the city wide data; any of them that is
    missing (or empty) is fetched again, so pass them when calling this
    function several times for the same city.

    `depth` selects the category level: `top_category` is used when it is 1,
    `parenting_cat(cat, depth)` otherwise.

    Return a tuple `(res, tags)`:
    - `res` is a DataFrame indexed by venue id with the columns 'name',
      'price', 'rating', 'likes', 'users', 'checkins', 'tags' (only if
      `tags_freq`), 'cat', 'vis' (number of visits), 'H' (entropy of
      visitors), 'Ht' (entropy of visits time) and 'Den' (density at the
      venue location),
    - `tags` is an OrderedDict mapping each normalized tag to the number of
      times it appears, by decreasing count (empty if `tags_freq` is false).

    Raise IndexError if `vids` is empty and AssertionError if some ids are
    not found in the database.
    """
    tags = defaultdict(int)
    city = DB.venue.find_one({'_id': vids[0]})['city']
    visits = visits or xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = visitors or xp.get_visitors(CLIENT, city)
    density = density or estimate_density(city)
    wanted = {'cat': 1, 'name': 1, 'loc': 1, 'price': 1, 'rating': 1,
              'tags': 1, 'likes': 1, 'usersCount': 1, 'checkinsCount': 1}
    venues = list(DB.venue.find({'_id': {'$in': vids}}, wanted))
    msg = 'Asked for {} but get only {}'.format(len(vids), len(venues))
    # Kept as an assert so that callers still get an AssertionError; note
    # that it is skipped when Python runs with -O.
    assert len(vids) == len(venues), msg

    res = pd.DataFrame(index=[venue['_id'] for venue in venues])
    for field in ['name', 'price', 'rating', 'likes', 'usersCount',
                  'checkinsCount']:
        # 'usersCount' and 'checkinsCount' become 'users' and 'checkins'
        res[field.replace('Count', '')] = [venue[field] for venue in venues]
    if tags_freq:
        # Normalize every tag only once and use the result both for the
        # column and for the frequency count.
        venues_tags = [[normalized_tag(tag) for tag in venue['tags']]
                       for venue in venues]
        res['tags'] = venues_tags
        for venue_tags in venues_tags:
            for tag in venue_tags:
                tags[tag] += 1
    if depth == 1:
        res['cat'] = [top_category(venue['cat']) for venue in venues]
    else:
        res['cat'] = [parenting_cat(venue['cat'], depth)
                      for venue in venues]
    res['vis'] = [len(visits[id_]) for id_ in res.index]
    res['H'] = [venue_entropy(visitors[id_]) for id_ in res.index]
    res['Ht'] = [time_entropy(visits[id_]) for id_ in res.index]

    loc = [venue['loc']['coordinates'] for venue in venues]
    # Reverse the order of the coordinates before projecting them
    # (presumably from longitude, latitude to latitude, longitude).
    coords = np.fliplr(np.array(loc))
    points = cm.cities.GEO_TO_2D[city](coords)
    res['Den'] = density(points)

    # .items() works on both Python 2 and 3, unlike .iteritems()
    by_count = sorted(tags.items(), key=lambda x: x[1], reverse=True)
    return res, OrderedDict(by_count)
# Other clustering algorithms that were considered. Each assignment is
# overwritten by the next one and none of these objects is fitted.
clusterer = cl.DBSCAN(eps=5, min_samples=8, metric='cityblock')
clusterer = cl.AffinityPropagation(damping=.55, affinity='euclidean')
clusterer = cl.SpectralClustering(3, affinity='cosine', n_init=3)

hel = cn.load_matrix(city)
features = hel['v']
# The return value is discarded: with copy=False, the first three columns
# of `features` are presumably meant to be rescaled in place.
scale = pp.MinMaxScaler(copy=False)
scale.fit_transform(features[:, 0:3])

# Silhouette score of K-means for every number of clusters between 3 and 15
candidate_ks = range(3, 16)
scores = []
for k in candidate_ks:
    clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
    labels = clusterer.fit_predict(features)
    scores.append(mt.silhouette_score(features, labels))
    print(Counter(labels))
# Numbers of clusters ranked from best to worst score (+3 because the first
# score is for k=3). The value is only displayed in an interactive session.
np.argsort(scores)[::-1] + 3
ppl.plot(candidate_ks, scores, '+')

clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)
# Fit the final 6 clusters model. Without this, `labels` would still be the
# one of the last loop iteration (k=15) and the clusters 0 and 5 described
# below would not come from this model.
labels = clusterer.fit_predict(features)

visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
visitors = xp.get_visitors(CLIENT, city)
density = vf.estimate_density(city)
# Describe the venues of the first and last clusters, keeping only those
# that have visits information.
c0, _ = vf.venues_info(
    [vid for vid in hel['i'][labels == 0].tolist() if vid in visits],
    visits, visitors, density, depth=2, tags_freq=False)
c5, v = vf.venues_info(
    [vid for vid in hel['i'][labels == 5].tolist() if vid in visits],
    visits, visitors, density, depth=2, tags_freq=False)
c0.describe()
c5.describe()
hel = cn.load_matrix(city) features = hel['v'] scale = pp.MinMaxScaler(copy=False) scale.fit_transform(features[:, 0:3]) scores = [] for k in range(3, 16): clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500) labels = clusterer.fit_predict(features) scores.append(mt.silhouette_score(features, labels)) print(Counter(labels)) np.argsort(scores)[::-1] + 3 ppl.plot(range(3, 16), scores[0:], '+') clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False) clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500) visits = xp.get_visits(CLIENT, xp.Entity.venue, city) visitors = xp.get_visitors(CLIENT, city) density = vf.estimate_density(city) c0, _ = vf.venues_info( [_ for _ in hel['i'][labels == 0].tolist() if _ in visits], visits, visitors, density, depth=2, tags_freq=False) c5, v = vf.venues_info( [v for v in hel['i'][labels == 5].tolist() if v in visits], visits, visitors, density, depth=2,